So I am competing in a Kaggle competition (https://www.kaggle.com/competitions/playground-series-s4e8) where we have to predict whether a mushroom is poisonous or not based on the data provided.
The issue I am facing is that my models perform well on the training and validation sets (around 98-99% accuracy), but they fall apart when I actually submit the final predictions for the competition.
The best accuracy I have gotten so far, using the Random Forest model, was 52%, and the rest of my submissions performed substantially worse. Since the models perform well inside the notebook on the labelled data,
I assume the issue lies in how I am handling the data in general, because I did not implement techniques like feature engineering and I am not sure whether the way I converted categorical data to numeric data works correctly.
As mentioned before, I am using the Random Forest and/or XGBoost models, both of which are well known to be much less prone to overfitting than other models.
I also ran multiple iterations of multiple models to find the ones with the best parameters (as evident from the code below), which makes overfitting even less likely.
Here is the code for my data handling:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

# df_train is the competition's train.csv, loaded earlier in the notebook

def dataType(array_like):
    return array_like.dtype.name

# Flag which columns are categorical (dtype 'object')
types = []
for i in range(df_train.shape[1]):
    category = dataType(df_train[df_train.columns.values[i]])
    if category == 'object':
        types.append(1)
    else:
        types.append(0)
print(len(types))
print(types)

# Label-encode every categorical column (a fresh encoder per column)
for t in range(len(types)):
    if types[t] == 1:
        column = df_train.columns.values[t]
        encoder = LabelEncoder()
        df_train[column] = encoder.fit_transform(df_train[column])
print(df_train.head())

# Standardize the numeric columns
features_to_scale = ['cap-diameter', 'stem-height', 'stem-width']
scaler = StandardScaler()
df_train[features_to_scale] = scaler.fit_transform(df_train[features_to_scale])

# df_train = df_train.replace('NaN', 0)

feature_columns = ['id', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color', 'habitat', 'season']
X = df_train[feature_columns].to_numpy()
Y = df_train['class']

# Impute any remaining missing values
knn_imputer = KNNImputer(n_neighbors=3)
imputed_X = knn_imputer.fit_transform(X)

# X[np.isnan(X)] = 0
# Y[np.isnan(Y)] = 1

X_train, X_test, y_train, y_test = train_test_split(
    imputed_X, Y, test_size=0.2, random_state=42)

print(np.isnan(X).sum())
print(np.isnan(imputed_X).sum())
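(For context, my submission step is not shown here. Below is only a minimal sketch of what applying the same kind of preprocessing to the competition test set could look like; df_test and X_submit are illustrative names, not my exact code.)

# Illustrative only, not my exact submission code: preprocessing the competition test set
df_test = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv')  # standard Kaggle input path

for column in df_test.columns:
    if df_test[column].dtype.name == 'object':
        # NOTE: a LabelEncoder fitted fresh on the test set can assign different integers
        # to the same category than the encoders fitted on the training set
        df_test[column] = LabelEncoder().fit_transform(df_test[column])

# Reuse the scaler and imputer that were fitted on the training data
df_test[features_to_scale] = scaler.transform(df_test[features_to_scale])
X_submit = knn_imputer.transform(df_test[feature_columns].to_numpy())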
My code for the XGBoost implementation and for finding the model with the best hyperparameters:
import optuna
import xgboost as xgb
from sklearn.metrics import accuracy_score

def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 10.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 5)
    }
    # Initialize the model with the suggested parameters
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', **param)
    # Train the model
    model.fit(X_train, y_train)
    # Predict on the held-out split and return its accuracy as the objective value
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Search for the best hyperparameters
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=75)

# Refit a model with the best parameters found and evaluate it
best_params = study.best_params
best_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', **best_params)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
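(Again for context only, the submission itself would then be something like the sketch below. X_submit is the preprocessed test matrix from the earlier sketch, and class_encoder stands in for the LabelEncoder that was fitted on the 'class' column, which my loop above does not actually keep a reference to.)

# Illustrative sketch of writing the submission file with the tuned XGBoost model
test_pred = best_model.predict(X_submit)

submission = pd.DataFrame({
    'id': df_test['id'],
    # class_encoder is hypothetical here; it would map the 0/1 predictions back to 'e'/'p'
    'class': class_encoder.inverse_transform(test_pred)
})
submission.to_csv('submission.csv', index=False)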
My code for the Random Forest implementation and for finding the model with the best hyperparameters:
from sklearn.ensemble import RandomForestClassifier

# (min_samples_split, max_depth, random_state) combinations to try
params = (
    (2, 10, 42),
    (5, 20, 0),
    (10, 30, 123),
    (15, 40, 1),
    (20, 50, 7),
    (2, 15, 99),
    (5, 25, 56),
    (10, 35, 78),
    (15, 45, 100),
    (20, 60, 202),
    (3, 12, 8),
    (6, 18, 15),
    (12, 28, 30),
    (18, 38, 60),
    (25, 50, 90),
    (8, 20, 45),
    (14, 32, 67)
)
# A single baseline model before the parameter sweep
model = RandomForestClassifier(max_depth=35, min_samples_split=10, random_state=78)
model.fit(X_train, y_train)
predictions_train = model.predict(X_train)
predictions_test = model.predict(X_test)
accuracy_train = accuracy_score(y_train, predictions_train)
accuracy_test = accuracy_score(y_test, predictions_test)
print(accuracy_train)
print(accuracy_test)
random_forest_model = model
# Train one forest per parameter combination and record its train/test accuracy
random_forest_models = []
random_forest_accuracies_train = []
random_forest_accuracies_test = []
for min_samples_split, max_depth, random_state in params:
    model = RandomForestClassifier(min_samples_split=min_samples_split, max_depth=max_depth, random_state=random_state)
    model.fit(X_train, y_train)
    predictions_train = model.predict(X_train)
    predictions_test = model.predict(X_test)
    random_forest_models.append(model)
    accuracy_train = accuracy_score(y_train, predictions_train)
    accuracy_test = accuracy_score(y_test, predictions_test)
    random_forest_accuracies_train.append(accuracy_train)
    random_forest_accuracies_test.append(accuracy_test)
    print("Model trained")

print(random_forest_accuracies_train)
print(random_forest_accuracies_test)
I then pick the model with the highest accuracy (and the least overfitting) and save it in my Kaggle notebook; a rough sketch of that step is below. This is my first time in a Kaggle competition and my first time building a real ML model from scratch, so please help me out!!
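(The selection-and-saving step is roughly like this; it is only a sketch, and the joblib file name is just an example.)

import joblib

# Pick the sweep model with the highest held-out accuracy and persist it in the notebook
best_idx = int(np.argmax(random_forest_accuracies_test))
best_rf = random_forest_models[best_idx]
print(params[best_idx], random_forest_accuracies_test[best_idx])

joblib.dump(best_rf, 'best_random_forest.joblib')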