I am trying to build a demand forecasting model for taxis, but I am unsure how to train and test it correctly. I want the model to be evaluated on unseen data (the test set). Is this the correct way to implement that, and does the model work?
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
# Load the dataset and index it by pickup timestamp.
df = pd.read_csv('combined.csv')
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df.set_index('tpep_pickup_datetime', inplace=True)
# Selecting data for a single location, here location ID '132', as a column vector.
# NOTE(review): the positional train/validation/test split used later assumes
# the rows are already in chronological order - confirm for combined.csv.
data = df['132'].values.reshape(-1, 1)
# Fit the scaler on the leading (training) rows only. Fitting it on the whole
# series would let the min/max of the validation and test periods leak into
# the features the model is trained on.
# Keep 0.7 in sync with the training fraction used when the windows are split.
scaler_fit_rows = max(1, int(len(data) * 0.7))
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(data[:scaler_fit_rows])
# Rows after the fitted range may map slightly outside [0, 1]; that is the
# honest consequence of not peeking at future data.
data_scaled = scaler.transform(data)
# Function to create sequences
def create_dataset(dataset, look_back=10):
    """Slice a 2-D series into supervised (window, next value) pairs.

    Only column 0 of ``dataset`` is used. Sample ``i`` consists of the
    ``look_back`` consecutive values starting at row ``i`` and, as target,
    the value at row ``i + look_back``.

    Args:
        dataset: 2-D array-like of shape (n_rows, n_columns).
        look_back: Number of past values in each input window.

    Returns:
        A tuple ``(X, Y)`` where ``X`` has shape
        ``(n_rows - look_back, look_back)`` and ``Y`` has shape
        ``(n_rows - look_back,)``. Both are empty (with those shapes) when
        the series is too short to form a single sample.
    """
    dataset = np.asarray(dataset)
    # Every row from index look_back onwards is a valid target, so there are
    # exactly len(dataset) - look_back samples (an extra "- 1" would silently
    # drop the most recent one).
    n_samples = len(dataset) - look_back
    if n_samples <= 0:
        # Keep the 2-D shape so callers can still reshape X safely.
        return (np.empty((0, look_back), dtype=dataset.dtype),
                np.empty((0,), dtype=dataset.dtype))
    X = np.array([dataset[i:i + look_back, 0] for i in range(n_samples)])
    Y = dataset[look_back:, 0].copy()
    return X, Y
# Turn the scaled series into look-back windows and add the trailing
# feature axis so X is (samples, look_back, 1).
look_back = 10
X, Y = create_dataset(data_scaled, look_back)
X = X.reshape(X.shape[0], X.shape[1], 1)
# Positional 70% / 20% / remainder split into train, validation, and test
# sets; slices are taken in order, never shuffled.
n_windows = len(X)
train_size = int(n_windows * 0.7)
val_size = int(n_windows * 0.2)
test_size = n_windows - train_size - val_size
val_end = train_size + val_size
trainX, valX, testX = X[:train_size], X[train_size:val_end], X[val_end:]
trainY, valY, testY = Y[:train_size], Y[train_size:val_end], Y[val_end:]
# Build the Bidirectional LSTM model: two stacked bidirectional LSTM layers
# with dropout in between, followed by a single-unit output layer.
model = Sequential()
model.add(Bidirectional(LSTM(50, return_sequences=True), input_shape=(look_back, 1)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(50)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')
# Use early stopping to halt the training when validation loss stops improving.
# Patience has to be smaller than the epoch budget, otherwise the callback can
# never trigger; restore_best_weights rolls the model back to the best epoch
# instead of keeping the weights of the last (possibly worse) one.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Fit the model on training data, validate on validation data
model.fit(trainX, trainY, epochs=10, batch_size=64, verbose=1, validation_data=(valX, valY), callbacks=[early_stop])
# Combine training and validation data
combinedX = np.concatenate((trainX, valX), axis=0)
combinedY = np.concatenate((trainY, valY), axis=0)
# Continue training the same model on the combined data. No validation set is
# passed to this fit, so a callback watching 'val_loss' would have nothing to
# monitor; use a dedicated callback that watches the training loss instead.
retrain_stop = EarlyStopping(monitor='loss', patience=3)
model.fit(combinedX, combinedY, epochs=10, batch_size=64, verbose=1, callbacks=[retrain_stop])
# Predict on the data the model was fitted on and on the held-out test windows.
train_predict = model.predict(combinedX)
test_predict = model.predict(testX)
# Inverse transformation back to the original units. The targets are passed
# as a single row, so the *_inv arrays have shape (1, n_samples).
train_predict = scaler.inverse_transform(train_predict)
combinedY_inv = scaler.inverse_transform([combinedY])
test_predict = scaler.inverse_transform(test_predict)
testY_inv = scaler.inverse_transform([testY])
# Quantify the error instead of judging the fit by eye. The test RMSE is the
# number that reflects performance on unseen data.
train_rmse = np.sqrt(np.mean((combinedY_inv[0] - train_predict[:, 0]) ** 2))
test_rmse = np.sqrt(np.mean((testY_inv[0] - test_predict[:, 0]) ** 2))
# Persistence baseline: predict that the next value equals the last value in
# the input window. A model that is not clearly better than this has learned
# little beyond copying the previous observation.
naive_test = scaler.inverse_transform(testX[:, -1, :])[:, 0]
naive_rmse = np.sqrt(np.mean((testY_inv[0] - naive_test) ** 2))
print(f'Train+Val RMSE: {train_rmse:.3f}')
print(f'Test RMSE: {test_rmse:.3f}')
print(f'Naive last-value baseline test RMSE: {naive_rmse:.3f}')
# Plot the actual series with the predictions aligned to the time step each
# one forecasts.
plt.figure(figsize=(12, 6))
plt.plot(scaler.inverse_transform(data_scaled), label='Actual')
# Window i predicts the value at row i + look_back, so the fitted predictions
# start at x = look_back.
train_start = look_back
train_end = train_start + len(train_predict)
plt.plot(np.arange(train_start, train_end), train_predict, label='Train+Val Predict')
# The test windows directly follow the train+val windows in the same windowed
# array, so their predictions continue right where the previous curve ends.
# (An extra look_back + 1 offset is only needed when the series is split
# before windowing, which is not the case here.)
test_start = train_end
test_end = test_start + len(test_predict)
plt.plot(np.arange(test_start, test_end), test_predict, label='Test Predict')
plt.title('Taxi Demand Prediction')
plt.xlabel('Time Interval')
plt.ylabel('Taxi Trips')
plt.legend()
plt.show()
I have experimented with which data the model is trained on, but the model seems too accurate, so I suspect there may be data leakage.
New contributor
James White is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.