I’m uncertain about how to set the seed for training the model. When I use different seeds, the results vary. Is there a gold standard for choosing a seed, or should I try multiple seeds and select the best results?
Here is my code:
<code>library(MASS) # For glm.nb function
library(dplyr) # For data manipulation
library(tidyverse) # For ggplot2 and other utilities
library(caret) # For data splitting and cross-validation
library(Metrics) # For RMSE, MAE, and R-squared metrics
# Step 1: Define file paths
microbial_abundance_file <- "glm_nb/asv_abundance_all.csv"
scfa_concentration_file <- "glm_nb/acetate_w2.csv"
# Step 2: Load your data
microbial_abundance <- read.csv(microbial_abundance_file)
scfa_concentration <- read.csv(scfa_concentration_file)
# Step 3: Combine the data by SampleID
data_combined <- inner_join(scfa_concentration, microbial_abundance, by = "SampleID")
# Step 4: Convert SampleID to a character vector to avoid factor issues
data_combined$SampleID <- as.character(data_combined$SampleID)
# Remove the first column for modeling
data_combined <- data_combined[, -1]
# Step 5: Split data into training and test sets
set.seed(271) # For reproducibility
train_index <- createDataPartition(data_combined$Acetic.acid, p = 0.75, list = FALSE)
train_data <- data_combined[train_index, ]
test_data <- data_combined[-train_index, ]
# Define control for cross-validation within the training set
cv_control <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
# Define the columns to include in the model
columns <- c(
"d__Bacteria.p__Firmicutes.c__Clostridia.o__Oscillospirales.f__Ruminococcaceae.g__Faecalibacterium.__",
"d__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.Selenomonadales.f__Selenomonadaceae.g__Megamonas.__"
)
# Create the formula dynamically
formula <- as.formula(paste("Acetic.acid ~", paste(columns, collapse = " + ")))
# Print the formula to verify
print(formula)
# Train the model with the updated formula
cv_model <- train(
formula,
data = train_data,
method = "glm.nb",
trControl = cv_control
)
# Print cross-validation results
print(cv_model$results)
# Display the final model details
print(cv_model$finalModel)
# Step 7: Evaluate the final model on the test set
predicted_values <- predict(cv_model, newdata = test_data, type = "raw")
actual_values <- test_data$Acetic.acid
residuals <- actual_values - predicted_values
# Calculate performance metrics on the test data
test_rmse <- sqrt(mean(residuals^2))
test_mae <- mean(abs(residuals))
test_r_squared <- 1 - (sum(residuals^2) / sum((actual_values - mean(actual_values))^2))
cat("Test RMSE:", test_rmse, "n")
cat("Test MAE:", test_mae, "n")
cat("Test R-squared:", test_r_squared, "n")
# Step 8: Plot predicted vs actual SCFA concentration for the test set
ggplot(data.frame(actual_values, predicted_values), aes(x = actual_values, y = predicted_values)) +
geom_point() +
geom_smooth(method = "lm", col = "red") +
labs(title = "Predicted vs Actual SCFA Concentration (Test Set)",
x = "Actual SCFA Concentration",
y = "Predicted SCFA Concentration") +
annotate("text", x = min(actual_values), y = max(predicted_values),
label = paste("RMSE:", round(test_rmse, 2)), hjust = 0, vjust = 1.5) +
annotate("text", x = min(actual_values), y = max(predicted_values) - 0.5,
label = paste("MAE:", round(test_mae, 2)), hjust = 0, vjust = 1.5) +
annotate("text", x = min(actual_values), y = max(predicted_values) - 1,
label = paste("R²:", round(test_r_squared, 2)), hjust = 0, vjust = 1.5)
</code>
<code>library(MASS) # For glm.nb function
library(dplyr) # For data manipulation
library(tidyverse) # For ggplot2 and other utilities
library(caret) # For data splitting and cross-validation
library(Metrics) # For RMSE, MAE, and R-squared metrics
# Step 1: Define file paths
microbial_abundance_file <- "glm_nb/asv_abundance_all.csv"
scfa_concentration_file <- "glm_nb/acetate_w2.csv"
# Step 2: Load your data
microbial_abundance <- read.csv(microbial_abundance_file)
scfa_concentration <- read.csv(scfa_concentration_file)
# Step 3: Combine the data by SampleID
data_combined <- inner_join(scfa_concentration, microbial_abundance, by = "SampleID")
# Step 4: Convert SampleID to a character vector to avoid factor issues
data_combined$SampleID <- as.character(data_combined$SampleID)
# Remove the first column for modeling
data_combined <- data_combined[, -1]
# Step 5: Split data into training and test sets
set.seed(271) # For reproducibility
train_index <- createDataPartition(data_combined$Acetic.acid, p = 0.75, list = FALSE)
train_data <- data_combined[train_index, ]
test_data <- data_combined[-train_index, ]
# Define control for cross-validation within the training set
cv_control <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
# Define the columns to include in the model
columns <- c(
"d__Bacteria.p__Firmicutes.c__Clostridia.o__Oscillospirales.f__Ruminococcaceae.g__Faecalibacterium.__",
"d__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.Selenomonadales.f__Selenomonadaceae.g__Megamonas.__"
)
# Create the formula dynamically
formula <- as.formula(paste("Acetic.acid ~", paste(columns, collapse = " + ")))
# Print the formula to verify
print(formula)
# Train the model with the updated formula
cv_model <- train(
formula,
data = train_data,
method = "glm.nb",
trControl = cv_control
)
# Print cross-validation results
print(cv_model$results)
# Display the final model details
print(cv_model$finalModel)
# Step 7: Evaluate the final model on the test set
predicted_values <- predict(cv_model, newdata = test_data, type = "raw")
actual_values <- test_data$Acetic.acid
residuals <- actual_values - predicted_values
# Calculate performance metrics on the test data
test_rmse <- sqrt(mean(residuals^2))
test_mae <- mean(abs(residuals))
test_r_squared <- 1 - (sum(residuals^2) / sum((actual_values - mean(actual_values))^2))
cat("Test RMSE:", test_rmse, "n")
cat("Test MAE:", test_mae, "n")
cat("Test R-squared:", test_r_squared, "n")
# Step 8: Plot predicted vs actual SCFA concentration for the test set
ggplot(data.frame(actual_values, predicted_values), aes(x = actual_values, y = predicted_values)) +
geom_point() +
geom_smooth(method = "lm", col = "red") +
labs(title = "Predicted vs Actual SCFA Concentration (Test Set)",
x = "Actual SCFA Concentration",
y = "Predicted SCFA Concentration") +
annotate("text", x = min(actual_values), y = max(predicted_values),
label = paste("RMSE:", round(test_rmse, 2)), hjust = 0, vjust = 1.5) +
annotate("text", x = min(actual_values), y = max(predicted_values) - 0.5,
label = paste("MAE:", round(test_mae, 2)), hjust = 0, vjust = 1.5) +
annotate("text", x = min(actual_values), y = max(predicted_values) - 1,
label = paste("R²:", round(test_r_squared, 2)), hjust = 0, vjust = 1.5)
</code>
library(MASS) # For glm.nb function
library(dplyr) # For data manipulation
library(tidyverse) # For ggplot2 and other utilities
library(caret) # For data splitting and cross-validation
library(Metrics) # For RMSE, MAE, and R-squared metrics
# Step 1: Define file paths
microbial_abundance_file <- "glm_nb/asv_abundance_all.csv"
scfa_concentration_file <- "glm_nb/acetate_w2.csv"
# Step 2: Load your data
microbial_abundance <- read.csv(microbial_abundance_file)
scfa_concentration <- read.csv(scfa_concentration_file)
# Step 3: Combine the data by SampleID
data_combined <- inner_join(scfa_concentration, microbial_abundance, by = "SampleID")
# Step 4: Convert SampleID to a character vector to avoid factor issues
data_combined$SampleID <- as.character(data_combined$SampleID)
# Remove the first column for modeling
data_combined <- data_combined[, -1]
# Step 5: Split data into training and test sets
set.seed(271) # For reproducibility
train_index <- createDataPartition(data_combined$Acetic.acid, p = 0.75, list = FALSE)
train_data <- data_combined[train_index, ]
test_data <- data_combined[-train_index, ]
# Define control for cross-validation within the training set
cv_control <- trainControl(method = "cv", number = 10) # 10-fold cross-validation
# Define the columns to include in the model
columns <- c(
"d__Bacteria.p__Firmicutes.c__Clostridia.o__Oscillospirales.f__Ruminococcaceae.g__Faecalibacterium.__",
"d__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.Selenomonadales.f__Selenomonadaceae.g__Megamonas.__"
)
# Create the formula dynamically
formula <- as.formula(paste("Acetic.acid ~", paste(columns, collapse = " + ")))
# Print the formula to verify
print(formula)
# Train the model with the updated formula
cv_model <- train(
formula,
data = train_data,
method = "glm.nb",
trControl = cv_control
)
# Print cross-validation results
print(cv_model$results)
# Display the final model details
print(cv_model$finalModel)
# Step 7: Evaluate the final model on the test set
predicted_values <- predict(cv_model, newdata = test_data, type = "raw")
actual_values <- test_data$Acetic.acid
residuals <- actual_values - predicted_values
# Calculate performance metrics on the test data
test_rmse <- sqrt(mean(residuals^2))
test_mae <- mean(abs(residuals))
test_r_squared <- 1 - (sum(residuals^2) / sum((actual_values - mean(actual_values))^2))
cat("Test RMSE:", test_rmse, "n")
cat("Test MAE:", test_mae, "n")
cat("Test R-squared:", test_r_squared, "n")
# Step 8: Plot predicted vs actual SCFA concentration for the test set
ggplot(data.frame(actual_values, predicted_values), aes(x = actual_values, y = predicted_values)) +
geom_point() +
geom_smooth(method = "lm", col = "red") +
labs(title = "Predicted vs Actual SCFA Concentration (Test Set)",
x = "Actual SCFA Concentration",
y = "Predicted SCFA Concentration") +
annotate("text", x = min(actual_values), y = max(predicted_values),
label = paste("RMSE:", round(test_rmse, 2)), hjust = 0, vjust = 1.5) +
annotate("text", x = min(actual_values), y = max(predicted_values) - 0.5,
label = paste("MAE:", round(test_mae, 2)), hjust = 0, vjust = 1.5) +
annotate("text", x = min(actual_values), y = max(predicted_values) - 1,
label = paste("R²:", round(test_r_squared, 2)), hjust = 0, vjust = 1.5)
So far, I’ve found that using seed 271 provides the best result with the highest R-squared and lowest rmse and mae.
Thank you.