I am generating recursive one-step-ahead predictions for a time series y from a minimal set of regressors. Linear models all perform similarly and fail to beat naive baseline models, so I decided to try non-linear models, specifically Random Forest, XGBoost regression, and an LSTM. Random Forest and the LSTM do not outperform the linear models, while XGBoost regression is almost perfect. So XGBoost can clearly deliver excellent predictions here, which raises the question of why the LSTM cannot at least come close. I suspect there is an error in my code or in the way I train the LSTM.
Here’s an overview of the code:
- Data Preparation: Split the data into training and test sets. The X training set is standardized, y is not, so that predictions do not have to be rescaled (a minimal sketch of this scaling step follows this list).
- LSTM Training Function (train_lstm_lagged_pca): The training set is split further into training and validation sets. Hyperparameter tuning for the LSTM is done via Bayesian optimization with the keras_tuner package, and the best hyperparameters are saved globally as best_hps. (I also tried different values for the number of epochs and max_trials, but this does not improve the LSTM's performance significantly.)
- Model Initialization: Random Forest and XGBoost regression models are initialized with minimal hyperparameter optimization.
- Recursive Forecasting: One-step-ahead forecasts are generated recursively. At each step, the LSTM uses the best hyperparameter combination from the global environment.
- Results: RMSE values are calculated and the results are plotted.
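A minimal sketch of the scaling step described above (assuming sklearn's StandardScaler, fitted on the training block only so that no test information leaks into the scaler; y is left on its original scale):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit the scaler on the training features only
X_test_scaled = scaler.transform(X_test)        # reuse the training mean/std on the test features
# y_train / y_test are not scaled, so predictions need no rescaling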
Prediction results:
Random Forest and the LSTM have roughly the same prediction error, while XGBoost outperforms them by far. Why can't the LSTM at least be modified so that it comes closer to the actual values, when XGBoost tracks them almost perfectly?
RMSE for Random Forest: 0.973292
RMSE for XGBoost Regressor: 0.000598
RMSE for LSTM: 0.976331
Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
import keras_tuner
from keras_tuner import BayesianOptimization
# Step 1: Prepare data (random placeholder data standing in for the real regressors)
X = pd.DataFrame(np.random.randn(280, 16), columns=[f'feature_{i+1}' for i in range(16)])
X = X.values  # work with the underlying numpy array
y = pd.Series(np.random.randn(280), name='target').values

# Split data into training and testing (last 10% for testing)
split_index = int(len(X) * 0.9)  # 90% train, 10% test
X_train = X[:split_index]
X_test = X[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]
# Step 2: Train LSTM neural network (tuning is based only on the training data)
def build_model(hp):
    # Re-derive the LSTM training input purely to obtain the input shape:
    # split the training set again so that tuning never touches the test data
    split_index_lstm = int(len(X_train) * 0.8)  # 80% train, 20% validation
    X_train_lstm = X_train[:split_index_lstm]
    nrow_xtrain, ncol_xtrain = X_train_lstm.shape
    # the whole training history is treated as one sequence of shape (1, timesteps, features)
    x_train_data_lstm = X_train_lstm.reshape(1, nrow_xtrain, ncol_xtrain)

    model = Sequential()
    model.add(LSTM(hp.Int('input_unit', min_value=1, max_value=512, step=32),
                   return_sequences=True,
                   input_shape=(x_train_data_lstm.shape[1], x_train_data_lstm.shape[2])))
    for i in range(hp.Int('n_layers', 1, 6)):
        model.add(LSTM(hp.Int(f'lstm_{i}_units', min_value=1, max_value=512, step=32),
                       return_sequences=True))
    model.add(LSTM(hp.Int('layer_2_neurons', min_value=1, max_value=512, step=32)))
    model.add(Dropout(hp.Float('Dropout_rate', min_value=0, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation="linear"))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])
    return model
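# (Optional, illustrative check, not part of the original pipeline: build the model once with
# keras_tuner's default HyperParameters to confirm the architecture compiles; every hp.Int /
# hp.Float above falls back to its minimum value when no explicit default is set.)
sanity_hp = keras_tuner.HyperParameters()
sanity_model = build_model(sanity_hp)
sanity_model.summary()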
def train_lstm_lagged_pca():
    # Split the training set again into training and validation data,
    # so that LSTM tuning is based only on the training data
    split_index_lstm = int(len(X_train) * 0.8)  # 80% train, 20% validation
    X_train_lstm = X_train[:split_index_lstm]
    X_test_lstm = X_train[split_index_lstm:]
    y_train_lstm = y_train[:split_index_lstm]
    y_test_lstm = y_train[split_index_lstm:]

    # Reshape to the (samples, timesteps, features) layout expected by the LSTM:
    # each split is treated as a single sequence (samples = 1)
    nrow_xtrain, ncol_xtrain = X_train_lstm.shape
    x_train_data_lstm = X_train_lstm.reshape(1, nrow_xtrain, ncol_xtrain)
    y_train_data_lstm = y_train_lstm.reshape(1, y_train_lstm.shape[0], 1)
    nrow_xtest, ncol_xtest = X_test_lstm.shape
    x_test_data_lstm = X_test_lstm.reshape(1, nrow_xtest, ncol_xtest)
    y_test_data_lstm = y_test_lstm.reshape(1, y_test_lstm.shape[0], 1)

    # Bayesian optimization over the hyperparameters defined in build_model
    tuner = BayesianOptimization(
        hypermodel=build_model,
        objective='mse',
        max_trials=2,
    )
    tuner.search(
        x=x_train_data_lstm,
        y=y_train_data_lstm,
        epochs=1,
        batch_size=1,
        validation_data=(x_test_data_lstm, y_test_data_lstm),
    )

    # After tuning, keep the best model and hyperparameters globally
    global best_model, best_hps
    best_model = tuner.get_best_models(num_models=1)[0]
    best_hps = tuner.get_best_hyperparameters()[0]
    return best_model, best_hps

train_lstm_lagged_pca()
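# (Illustrative, not in the original script: the tuned configuration can be inspected before
# the forecasting loop; HyperParameters.values is a dict of the chosen units, layers and dropout.)
print("Best LSTM hyperparameters:", best_hps.values)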
# Step 3: Initialize all competing models
# Initialize arrays to store predictions
rf_preds = np.zeros(len(y_test))
xgboost_preds = np.zeros(len(y_test))
lstm_preds = np.zeros(len(y_test))
# Initialize models
rf_model = RandomForestRegressor(random_state=42, n_estimators=1000)
xg_boost = XGBRegressor(objective='reg:squarederror', n_estimators=10000)
# Step 4: Perform one-step-ahead forecasting
for i in range(len(y_test)):
    print(i, len(X_train) + i)

    # Expanding training window: rows 1 through len(X_train)+i of X and y
    x_train_data = X[1:(len(X_train) + i + 1), :]
    y_train_data = y[1:(len(y_train) + i + 1)]

    # Reshape the expanding window into a single sequence for the LSTM
    nrow_xtrain, ncol_xtrain = x_train_data.shape
    x_train_data_lstm = x_train_data.reshape(1, nrow_xtrain, ncol_xtrain)
    y_train_data_lstm = y_train_data.reshape(1, y_train_data.shape[0], 1)

    # Random Forest prediction (refit each step on the fixed initial training set)
    rf_model.fit(X_train, y_train)
    rf_preds[i] = rf_model.predict(X_test[i].reshape(1, -1))[0]

    # XGBoost prediction (refit each step on the expanding window)
    xg_boost.fit(x_train_data, y_train_data)
    xgboost_preds[i] = xg_boost.predict(X_test[i].reshape(1, -1))[0]

    # LSTM prediction: rebuild with the tuned hyperparameters and refit on the expanding window
    model = build_model(best_hps)
    model.fit(x=x_train_data_lstm, y=y_train_data_lstm, epochs=1)

    # X_test[i] has shape (16,); repeat the single test row across all timesteps so that the
    # input matches the (1, timesteps, features) shape the network was fitted on
    X_test_reshaped = X_test[i].reshape((1, X_test[i].shape[0]))
    X_test_padded = np.tile(X_test_reshaped, (x_train_data_lstm.shape[1], 1))
    X_test_lstm = X_test_padded.reshape((1, x_train_data_lstm.shape[1], x_train_data_lstm.shape[2]))
    lstm_preds[i] = model.predict(X_test_lstm)[0, 0]
###
# Step 5: Evaluate all competing models
# Calculate RMSE
rmse_rf = np.sqrt(mean_squared_error(y_test, rf_preds))
rmse_xgboost= np.sqrt(mean_squared_error(y_test, xgboost_preds))
rmse_lstm= np.sqrt(mean_squared_error(y_test, lstm_preds))
print(f"RMSE for Random Forest: {rmse_rf:.6f}")
print(f"RMSE for XGBoost Regressor: {rmse_xgboost:.6f}")
print(f"RMSE for LSTM: {rmse_lstm:.6f}")
# NON-LINEAR: Plotting actual vs predicted values
plt.figure(figsize=(14, 7))
plt.plot(y_test, label='Actual')
plt.plot(rf_preds, label='Random Forest Predictions')
plt.plot(xgboost_preds, label='XGBoost Regressor Predictions')
plt.plot(lstm_preds, label='LSTM Predictions')
plt.title('Actual vs Non-linear Predicted Values')
plt.legend()
plt.show()
(PS: I know the code is far from efficient; I'm just getting into this.)