I am using multiple machine learning models for AQI prediction. The data is daily and there are 1850 records. I am getting train accuracy of around 99 and test accuracy of around 91. Is this gap okay? If not, how can I improve my test accuracy?
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
# Sort the data chronologically before splitting into training and test sets
data = data.sort_values(by=['Year', 'Month', 'Day'])
# Define your features (X) and target variable (y)
X = data[['Year', 'Month', 'Day', 'Raw Conc.', 'NowCast Conc.']]
y = data['AQI']
# Split data into training and test sets based on a time-based approach
# Example: Use first 80% of data for training, last 20% for testing
split_index = int(len(data) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]
# Standardize the features (the tree-based models below are insensitive to this scaling, but it does no harm)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Parameter grids for each model
param_grids = {
    "Decision Tree": {'max_depth': [3, 5, 7, 10]},
    "Random Forest": {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7, 10]},
    "Gradient Boosting": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.5]},
    "XGBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "CatBoost": {'iterations': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'depth': [3, 5, 7]},
}
# List of models to evaluate
models = [
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    ("Random Forest", RandomForestRegressor(random_state=42)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
    ("AdaBoost", AdaBoostRegressor(random_state=42)),
    ("XGBoost", XGBRegressor(random_state=42)),
    ("CatBoost", CatBoostRegressor(verbose=0)),
]
model_performance = {}
feature_importance_dict = {}
predictions = {}
for name, model in models:
    param_grid = param_grids[name]
    if param_grid:
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')
        grid_search.fit(X_train_scaled, y_train)
        best_model = grid_search.best_estimator_
    else:
        best_model = model
        best_model.fit(X_train_scaled, y_train)
    # Calculate predictions
    y_train_pred = best_model.predict(X_train_scaled)
    y_test_pred = best_model.predict(X_test_scaled)
    # Store predictions
    predictions[name] = {'model_name': name, 'y_test_pred': y_test_pred}
    # Calculate evaluation metrics for the train set
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_r2 = r2_score(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    # Calculate evaluation metrics for the test set
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    # Store model performance metrics
    model_performance[name] = {
        "Train_RMSE": train_rmse,
        "Train_R2": train_r2,
        "Train_MAE": train_mae,
        "Test_RMSE": test_rmse,
        "Test_R2": test_r2,
        "Test_MAE": test_mae
    }
    # Store feature importances where the model exposes them
    if hasattr(best_model, 'feature_importances_') or hasattr(best_model, 'coef_'):
        feature_importances = best_model.feature_importances_ if hasattr(best_model, 'feature_importances_') else best_model.coef_
        # Use the actual training columns so all five features are labelled correctly
        feature_names = list(X.columns)
        feature_importance_dict[name] = dict(zip(feature_names, feature_importances))
# Convert model performance dictionary to DataFrame
model_performance_df = pd.DataFrame.from_dict(model_performance, orient='index')
# Print model performance
print(model_performance_df)
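One thing I noticed about the code above: with cv=3, GridSearchCV falls back to plain K-fold splits, so some hyperparameters get validated on folds that come earlier in time than part of their training folds, which can be optimistic on chronologically ordered data. A time-aware splitter such as TimeSeriesSplit keeps every validation fold after its training data. Below is a minimal sketch for one of the models, reusing X_train_scaled, y_train and param_grids from above; the n_splits value is just illustrative.

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

# Each validation fold lies strictly after its training folds
tscv = TimeSeriesSplit(n_splits=5)

grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_grids["Gradient Boosting"],
    cv=tscv,  # swap the default K-fold for time-ordered splits
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train_scaled, y_train)
# RMSE on the time-aware validation folds (best_score_ is a negative MSE)
print(grid_search.best_params_, (-grid_search.best_score_) ** 0.5)

This mirrors the final 80/20 chronological hold-out: the earliest observations are only ever used for training, never for validation.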
I tried two approaches:
- Splitting the dataset with a fixed-window time series approach
- Splitting the dataset with a time-based (chronological 80/20) approach
The time-based approach improved the test accuracy by only a few points.
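For reference, a minimal sketch of how the two strategies can be expressed with scikit-learn's TimeSeriesSplit, assuming the date-sorted X from above; the split count and window size below are just illustrative.

from sklearn.model_selection import TimeSeriesSplit

# Time-based (expanding window): the training window grows with each split
expanding = TimeSeriesSplit(n_splits=5)

# Fixed window: cap the training window so only the most recent ~365 days are used
fixed_window = TimeSeriesSplit(n_splits=5, max_train_size=365)

for tr_idx, te_idx in expanding.split(X):
    print("expanding    train:", len(tr_idx), "test:", len(te_idx))
for tr_idx, te_idx in fixed_window.split(X):
    print("fixed-window train:", len(tr_idx), "test:", len(te_idx))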