I have this dataset:
<code>mydata=structure(list(crop_name = c("Guar", "Guar", "Guar", "Guar",
"Guar", "Guar", "Guar", "Guar", "Guar", "Guar", "Bajra", "Bajra",
"Bajra", "Bajra", "Bajra", "Bajra", "Bajra", "Bajra", "Bajra",
"Bajra"), B08A = c(2781L, 2817L, 2700L, 1780L, 3702L, 4094L,
3921L, 3780L, 2855L, 3501L, 2963L, 2578L, 1996L, 4502L, 4217L,
3528L, 3689L, 3529L, 3050L, 2996L), nir = c(2576L, 2607L, 2328L,
1541L, 3329L, 3681L, 3636L, 3382L, 2481L, 3174L, 2820L, 2691L,
2318L, 4229L, 3868L, 3648L, 3271L, 3244L, 2497L, 2752L), swir = c(2866L,
3589L, 2782L, 1447L, 2784L, 3084L, 3019L, 2972L, 1779L, 2687L,
3889L, 2332L, 1570L, 2574L, 2910L, 2546L, 2953L, 3176L, 3198L,
3464L), gcvi = c(0.624211853, 1.065768621, 0.897310513, 0.861111111,
1.384670487, 2.349408553, 2.246428571, 1.756316218, 1.753607103,
2.933085501, 0.923601637, 0.996290801, 2.274011299, 2.311667971,
3.084477296, 2.86031746, 2.494658119, 2.164878048, 1.824660633,
1.596226415)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-20L))
</code>
I’m trying to build a classifier that predicts which crop class (Bajra or Guar) an observation belongs to, so the dependent variable is crop_name and the remaining variables are predictors.
But I’m not just trying to fit a single classifier: I want to automatically search over the hyperparameters of a random forest until it reaches the accuracy I need (at least 90% by F-measure). In other words, the procedure should keep iterating over hyperparameters until the desired accuracy is achieved, and if that accuracy is simply not reachable with these predictors, it should report the best accuracy it did achieve (say, 70%).
Here is my attempt:
<code>library(caret)
library(randomForest)
# Convert the factor variable to a numeric one
mydata$crop_name <- as.numeric(mydata$crop_name) - 1
# We split the data into training and test samples
set.seed(123)
trainIndex <- createDataPartition(mydata$crop_name, p = 0.8, list = FALSE)
trainData <- mydata[trainIndex, ]
testData <- mydata[-trainIndex, ]
# Define a function to evaluate the model
evaluate_model <- function(model, testData) {
predictions <- predict(model, testData)
confusionMatrix <- confusionMatrix(data = factor(predictions, levels = c(0, 1)),
reference = factor(testData$crop_name, levels = c(0, 1)),
positive = "1")
precision <- confusionMatrix$byClass['Pos Pred Value']
recall <- confusionMatrix$byClass['Sensitivity']
f1 <- 2 * precision * recall / (precision + recall)
return(list(precision = precision, recall = recall, f1 = f1))
}
# **Random Forest**
# Define a grid of parameters for searching
rfGrid <- expand.grid(mtry = c(2, 3, 4),
ntree = c(500, 1000, 1500))
best_rf_f1 <- 0
# Enumerate random forest parameters
for (i in 1:nrow(rfGrid)) {
# Model training
rfModel <- randomForest(crop_name ~ ., data = trainData,
mtry = rfGrid$mtry[i],
ntree = rfGrid$ntree[i])
# Model evaluation
rfMetrics <- evaluate_model(rfModel, testData)
# Save the model if the F1-measure is better than the previous ones
if (rfMetrics$f1 > best_rf_f1) {
best_rf_f1 <- rfMetrics$f1
best_rf_model <- rfModel
}
}
cat("Random Forest - F1-measure:", best_rf_f1, "n")
</code>
However, I get this error:
<code>Error in if (rfMetrics$f1 > best_rf_f1) { :
missing value where TRUE/FALSE needed
In addition: Warning message:
In randomForest.default(m, y, ...) :
The response has five or fewer unique values. Are you sure you want to do regression?
</code>
What am I doing wrong? And how do I correctly make the model iterate over its hyperparameters until the desired accuracy, or the maximum achievable accuracy, is reached?
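Is something like the sketch below closer to what I need? It keeps crop_name as a factor so that randomForest runs in classification mode rather than regression, skips NA F-values, and breaks out of the loop once F1 reaches 0.9; the positive = "Guar" choice and the grid values are just assumptions on my part.
<code>library(caret)
library(randomForest)

# Keep the response as a factor so randomForest does classification, not regression
mydata$crop_name <- factor(mydata$crop_name)

set.seed(123)
trainIndex <- createDataPartition(mydata$crop_name, p = 0.8, list = FALSE)
trainData  <- mydata[trainIndex, ]
testData   <- mydata[-trainIndex, ]

# Grid of hyperparameters to enumerate
rfGrid <- expand.grid(mtry = c(2, 3, 4), ntree = c(500, 1000, 1500))

best_f1    <- -Inf
best_model <- NULL

for (i in seq_len(nrow(rfGrid))) {
  rfModel <- randomForest(crop_name ~ ., data = trainData,
                          mtry  = rfGrid$mtry[i],
                          ntree = rfGrid$ntree[i])
  preds <- predict(rfModel, testData)          # class labels, not numbers
  cm    <- confusionMatrix(preds, testData$crop_name, positive = "Guar")
  prec  <- cm$byClass["Pos Pred Value"]
  rec   <- cm$byClass["Sensitivity"]
  f1    <- 2 * prec * rec / (prec + rec)
  # Keep the best model so far; guard against NA when a class is never predicted
  if (!is.na(f1) && f1 > best_f1) {
    best_f1    <- f1
    best_model <- rfModel
  }
  # Stop early once the target F-measure is reached
  if (!is.na(f1) && f1 >= 0.9) break
}

cat("Best F1-measure achieved:", best_f1, "\n")
</code>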
As usual, any help is greatly appreciated