Thiết kế website giá rẻ

Question

I’m uncertain about how to set the seed for training the model. When I use different seeds, the results vary. Is there a gold standard for choosing a seed, or should I try multiple seeds and select the best results?

Here is my code:

<code>library(MASS) # For glm.nb function

library(dplyr) # For data manipulation

library(tidyverse) # For ggplot2 and other utilities

library(caret) # For data splitting and cross-validation

library(Metrics) # For RMSE, MAE, and R-squared metrics

# Step 1: Define file paths

microbial_abundance_file <- "glm_nb/asv_abundance_all.csv"

scfa_concentration_file <- "glm_nb/acetate_w2.csv"

# Step 2: Load your data

microbial_abundance <- read.csv(microbial_abundance_file)

scfa_concentration <- read.csv(scfa_concentration_file)

# Step 3: Combine the data by SampleID

data_combined <- inner_join(scfa_concentration, microbial_abundance, by = "SampleID")

# Step 4: Convert SampleID to a character vector to avoid factor issues

data_combined$SampleID <- as.character(data_combined$SampleID)

# Remove the first column for modeling

data_combined <- data_combined[, -1]

# Step 5: Split data into training and test sets

set.seed(271) # For reproducibility

train_index <- createDataPartition(data_combined$Acetic.acid, p = 0.75, list = FALSE)

train_data <- data_combined[train_index, ]

test_data <- data_combined[-train_index, ]

# Define control for cross-validation within the training set

cv_control <- trainControl(method = "cv", number = 10) # 10-fold cross-validation

# Define the columns to include in the model

columns <- c(

"d__Bacteria.p__Firmicutes.c__Clostridia.o__Oscillospirales.f__Ruminococcaceae.g__Faecalibacterium.__",

"d__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.Selenomonadales.f__Selenomonadaceae.g__Megamonas.__"

)

# Create the formula dynamically

formula <- as.formula(paste("Acetic.acid ~", paste(columns, collapse = " + ")))

# Print the formula to verify

print(formula)

# Train the model with the updated formula

cv_model <- train(

formula,

data = train_data,

method = "glm.nb",

trControl = cv_control

)

# Print cross-validation results

print(cv_model$results)

# Display the final model details

print(cv_model$finalModel)

# Step 7: Evaluate the final model on the test set

predicted_values <- predict(cv_model, newdata = test_data, type = "raw")

actual_values <- test_data$Acetic.acid

residuals <- actual_values - predicted_values

# Calculate performance metrics on the test data

test_rmse <- sqrt(mean(residuals^2))

test_mae <- mean(abs(residuals))

test_r_squared <- 1 - (sum(residuals^2) / sum((actual_values - mean(actual_values))^2))

cat("Test RMSE:", test_rmse, "n")

cat("Test MAE:", test_mae, "n")

cat("Test R-squared:", test_r_squared, "n")

# Step 8: Plot predicted vs actual SCFA concentration for the test set

ggplot(data.frame(actual_values, predicted_values), aes(x = actual_values, y = predicted_values)) +

geom_point() +

geom_smooth(method = "lm", col = "red") +

labs(title = "Predicted vs Actual SCFA Concentration (Test Set)",

x = "Actual SCFA Concentration",

y = "Predicted SCFA Concentration") +

annotate("text", x = min(actual_values), y = max(predicted_values),

label = paste("RMSE:", round(test_rmse, 2)), hjust = 0, vjust = 1.5) +

annotate("text", x = min(actual_values), y = max(predicted_values) - 0.5,

label = paste("MAE:", round(test_mae, 2)), hjust = 0, vjust = 1.5) +

annotate("text", x = min(actual_values), y = max(predicted_values) - 1,

label = paste("R²:", round(test_r_squared, 2)), hjust = 0, vjust = 1.5)

</code>

<code>library(MASS) # For glm.nb function library(dplyr) # For data manipulation library(tidyverse) # For ggplot2 and other utilities library(caret) # For data splitting and cross-validation library(Metrics) # For RMSE, MAE, and R-squared metrics # Step 1: Define file paths microbial_abundance_file <- "glm_nb/asv_abundance_all.csv" scfa_concentration_file <- "glm_nb/acetate_w2.csv" # Step 2: Load your data microbial_abundance <- read.csv(microbial_abundance_file) scfa_concentration <- read.csv(scfa_concentration_file) # Step 3: Combine the data by SampleID data_combined <- inner_join(scfa_concentration, microbial_abundance, by = "SampleID") # Step 4: Convert SampleID to a character vector to avoid factor issues data_combined$SampleID <- as.character(data_combined$SampleID) # Remove the first column for modeling data_combined <- data_combined[, -1] # Step 5: Split data into training and test sets set.seed(271) # For reproducibility train_index <- createDataPartition(data_combined$Acetic.acid, p = 0.75, list = FALSE) train_data <- data_combined[train_index, ] test_data <- data_combined[-train_index, ] # Define control for cross-validation within the training set cv_control <- trainControl(method = "cv", number = 10) # 10-fold cross-validation # Define the columns to include in the model columns <- c( "d__Bacteria.p__Firmicutes.c__Clostridia.o__Oscillospirales.f__Ruminococcaceae.g__Faecalibacterium.__", "d__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.Selenomonadales.f__Selenomonadaceae.g__Megamonas.__" ) # Create the formula dynamically formula <- as.formula(paste("Acetic.acid ~", paste(columns, collapse = " + "))) # Print the formula to verify print(formula) # Train the model with the updated formula cv_model <- train( formula, data = train_data, method = "glm.nb", trControl = cv_control ) # Print cross-validation results print(cv_model$results) # Display the final model details print(cv_model$finalModel) # Step 7: Evaluate the final model on the test set predicted_values <- predict(cv_model, newdata = test_data, type = "raw") actual_values <- test_data$Acetic.acid residuals <- actual_values - predicted_values # Calculate performance metrics on the test data test_rmse <- sqrt(mean(residuals^2)) test_mae <- mean(abs(residuals)) test_r_squared <- 1 - (sum(residuals^2) / sum((actual_values - mean(actual_values))^2)) cat("Test RMSE:", test_rmse, "n") cat("Test MAE:", test_mae, "n") cat("Test R-squared:", test_r_squared, "n") # Step 8: Plot predicted vs actual SCFA concentration for the test set ggplot(data.frame(actual_values, predicted_values), aes(x = actual_values, y = predicted_values)) + geom_point() + geom_smooth(method = "lm", col = "red") + labs(title = "Predicted vs Actual SCFA Concentration (Test Set)", x = "Actual SCFA Concentration", y = "Predicted SCFA Concentration") + annotate("text", x = min(actual_values), y = max(predicted_values), label = paste("RMSE:", round(test_rmse, 2)), hjust = 0, vjust = 1.5) + annotate("text", x = min(actual_values), y = max(predicted_values) - 0.5, label = paste("MAE:", round(test_mae, 2)), hjust = 0, vjust = 1.5) + annotate("text", x = min(actual_values), y = max(predicted_values) - 1, label = paste("R²:", round(test_r_squared, 2)), hjust = 0, vjust = 1.5) </code>

library(MASS)        # For glm.nb function
library(dplyr)       # For data manipulation
library(tidyverse)   # For ggplot2 and other utilities
library(caret)       # For data splitting and cross-validation
library(Metrics)     # For RMSE, MAE, and R-squared metrics

# Step 1: Define file paths
microbial_abundance_file <- "glm_nb/asv_abundance_all.csv"
scfa_concentration_file <- "glm_nb/acetate_w2.csv"

# Step 2: Load your data
microbial_abundance <- read.csv(microbial_abundance_file)
scfa_concentration <- read.csv(scfa_concentration_file)

# Step 3: Combine the data by SampleID
data_combined <- inner_join(scfa_concentration, microbial_abundance, by = "SampleID")

# Step 4: Convert SampleID to a character vector to avoid factor issues
data_combined$SampleID <- as.character(data_combined$SampleID)

# Remove the first column for modeling
data_combined <- data_combined[, -1]

# Step 5: Split data into training and test sets
set.seed(271)  # For reproducibility
train_index <- createDataPartition(data_combined$Acetic.acid, p = 0.75, list = FALSE)
train_data <- data_combined[train_index, ]
test_data <- data_combined[-train_index, ]

# Define control for cross-validation within the training set
cv_control <- trainControl(method = "cv", number = 10)  # 10-fold cross-validation

# Define the columns to include in the model
columns <- c(
  "d__Bacteria.p__Firmicutes.c__Clostridia.o__Oscillospirales.f__Ruminococcaceae.g__Faecalibacterium.__",
  "d__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.Selenomonadales.f__Selenomonadaceae.g__Megamonas.__"
)

# Create the formula dynamically
formula <- as.formula(paste("Acetic.acid ~", paste(columns, collapse = " + ")))

# Print the formula to verify
print(formula)

# Train the model with the updated formula
cv_model <- train(
  formula, 
  data = train_data, 
  method = "glm.nb", 
  trControl = cv_control
)

# Print cross-validation results
print(cv_model$results) 

# Display the final model details
print(cv_model$finalModel) 

# Step 7: Evaluate the final model on the test set
predicted_values <- predict(cv_model, newdata = test_data, type = "raw")
actual_values <- test_data$Acetic.acid
residuals <- actual_values - predicted_values

# Calculate performance metrics on the test data
test_rmse <- sqrt(mean(residuals^2))
test_mae <- mean(abs(residuals))
test_r_squared <- 1 - (sum(residuals^2) / sum((actual_values - mean(actual_values))^2))

cat("Test RMSE:", test_rmse, "n")
cat("Test MAE:", test_mae, "n")
cat("Test R-squared:", test_r_squared, "n")

# Step 8: Plot predicted vs actual SCFA concentration for the test set
ggplot(data.frame(actual_values, predicted_values), aes(x = actual_values, y = predicted_values)) +
  geom_point() +
  geom_smooth(method = "lm", col = "red") +
  labs(title = "Predicted vs Actual SCFA Concentration (Test Set)",
       x = "Actual SCFA Concentration",
       y = "Predicted SCFA Concentration") +
  annotate("text", x = min(actual_values), y = max(predicted_values), 
           label = paste("RMSE:", round(test_rmse, 2)), hjust = 0, vjust = 1.5) +
  annotate("text", x = min(actual_values), y = max(predicted_values) - 0.5, 
           label = paste("MAE:", round(test_mae, 2)), hjust = 0, vjust = 1.5) +
  annotate("text", x = min(actual_values), y = max(predicted_values) - 1, 
           label = paste("R²:", round(test_r_squared, 2)), hjust = 0, vjust = 1.5)

So far, I’ve found that using seed 271 provides the best result with the highest R-squared and lowest rmse and mae.

Thank you.

Thiết kế website giá rẻ

Danh mục

How do I set the seed for predictive modeling with glm.nb?