I’m learning how to use the KNN algorithm in python to forecast stock market prices. The problem I have is that the test_and_predict method which is used to test the model on historical data gives great results, but the train_and_predict method which is used to forecast into the future gives strange results. I have no idea what can cause this behavior.
Below is the part of the application responsible for data preparation, testing and forecasting
<code>def prepare_data(df, weeks_ahead, features, history):
matching_features = find_matching_columns(df, features)
closing_price_col = get_close_matches('Kurs zamknięcia', df.columns, n=1, cutoff=0.6)[0]
date_col = get_close_matches('Data', df.columns, n=1, cutoff=0.6)[0]
df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
for i in range(len(df) - max(history, weeks_ahead)):
if i + history + weeks_ahead >= len(df):
X.append(df[matching_features].iloc[i:i+history].values)
X.append(df[matching_features].iloc[i:i+history].values)
y.append(df[closing_price_col].iloc[i+history+weeks_ahead-1])
dates.append(df[date_col].iloc[i+history+weeks_ahead-1])
return np.array(X), np.array(y), np.array(dates)
def test_and_predict(X, y, dates, weeks, k, classifier, random_state, test_size=0.2):
n_samples, n_time_steps, n_features = X.shape
X = X.reshape(n_samples, n_time_steps*n_features)
x_testing = X[:len(X)-weeks]
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test = x_testing[:lenX-i], x_testing[lenX-i:lenX-i+1]
y_train, y_test = y_testing[:lenY-i], y_testing[lenY-i:lenY-i+1]
dates_test = dates[lenX-i:lenX-i+1]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
all_y_true.extend(y_test)
all_y_pred.extend(y_pred)
all_dates.extend(dates_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
accuracy_scores = [1 - mape for mape in mape_scores]
"cross_val_scores": accuracy_scores,
"accuracy_scores": accuracy_scores,
"mean_accuracy": np.mean(accuracy_scores),
"std_accuracy": np.std(accuracy_scores),
"random_state": random_state,
plt.figure(figsize=(10, 6))
plt.plot(all_dates, all_y_true, label='True Values', marker='o')
plt.plot(all_dates, all_y_pred, label='Predicted Values', marker='x')
plt.title('True vs. Predicted Values Over Time')
return classifier, scaler, results
def train_and_predict(X, y, dates, weeks, k, classifier, random_state, test_size=0.2):
n_samples, n_time_steps, n_features = X.shape
X = X.reshape(n_samples, n_time_steps*n_features)
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test = X[:len(X)-weeks], X[len(X)-i:len(X)-i+1]
dates_test = dates[len(y)-i:len(y)-i+1]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
all_y_pred.extend(y_pred)
plt.figure(figsize=(10, 6))
plt.plot( all_y_pred, label='Predicted Values', marker='x')
plt.title('Predicted Values Over Time')
<code>def prepare_data(df, weeks_ahead, features, history):
matching_features = find_matching_columns(df, features)
X = []
y = []
dates = []
closing_price_col = get_close_matches('Kurs zamknięcia', df.columns, n=1, cutoff=0.6)[0]
date_col = get_close_matches('Data', df.columns, n=1, cutoff=0.6)[0]
df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
for i in range(len(df) - max(history, weeks_ahead)):
if i + history + weeks_ahead >= len(df):
X.append(df[matching_features].iloc[i:i+history].values)
else:
X.append(df[matching_features].iloc[i:i+history].values)
y.append(df[closing_price_col].iloc[i+history+weeks_ahead-1])
dates.append(df[date_col].iloc[i+history+weeks_ahead-1])
return np.array(X), np.array(y), np.array(dates)
def test_and_predict(X, y, dates, weeks, k, classifier, random_state, test_size=0.2):
if len(X.shape) == 3:
n_samples, n_time_steps, n_features = X.shape
X = X.reshape(n_samples, n_time_steps*n_features)
x_testing = X[:len(X)-weeks]
y_testing = y[:len(y)]
scaler = StandardScaler()
X = scaler.fit_transform(X)
if k > len(X):
return None, None, None
mape_scores = []
all_y_true = []
all_y_pred = []
all_dates = []
lenX = len(x_testing)
lenY = len(y_testing)
for i in range(1,weeks):
X_train, X_test = x_testing[:lenX-i], x_testing[lenX-i:lenX-i+1]
y_train, y_test = y_testing[:lenY-i], y_testing[lenY-i:lenY-i+1]
dates_test = dates[lenX-i:lenX-i+1]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
all_y_true.extend(y_test)
all_y_pred.extend(y_pred)
all_dates.extend(dates_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
mape_scores.append(mape)
accuracy_scores = [1 - mape for mape in mape_scores]
results = {
"weeks": weeks,
"cross_val_scores": accuracy_scores,
"accuracy_scores": accuracy_scores,
"mean_accuracy": np.mean(accuracy_scores),
"std_accuracy": np.std(accuracy_scores),
"random_state": random_state,
"y_true": all_y_true,
"y_pred": all_y_pred,
"dates": all_dates
}
plt.figure(figsize=(10, 6))
plt.plot(all_dates, all_y_true, label='True Values', marker='o')
plt.plot(all_dates, all_y_pred, label='Predicted Values', marker='x')
plt.xlabel('Date')
plt.ylabel('Values')
plt.title('True vs. Predicted Values Over Time')
plt.legend()
plt.show()
return classifier, scaler, results
def train_and_predict(X, y, dates, weeks, k, classifier, random_state, test_size=0.2):
if len(X.shape) == 3:
n_samples, n_time_steps, n_features = X.shape
X = X.reshape(n_samples, n_time_steps*n_features)
scaler = StandardScaler()
X = scaler.fit_transform(X)
if k > len(X):
return None, None, None
mape_scores = []
all_y_true = []
all_y_pred = []
all_dates = []
for i in range(1,weeks):
X_train, X_test = X[:len(X)-weeks], X[len(X)-i:len(X)-i+1]
print(X_test)
y_train = y[:len(y)]
dates_test = dates[len(y)-i:len(y)-i+1]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
all_y_pred.extend(y_pred)
plt.figure(figsize=(10, 6))
plt.plot( all_y_pred, label='Predicted Values', marker='x')
plt.xlabel('Date')
plt.ylabel('Values')
plt.title('Predicted Values Over Time')
plt.legend()
plt.show()
</code>
def prepare_data(df, weeks_ahead, features, history):
matching_features = find_matching_columns(df, features)
X = []
y = []
dates = []
closing_price_col = get_close_matches('Kurs zamknięcia', df.columns, n=1, cutoff=0.6)[0]
date_col = get_close_matches('Data', df.columns, n=1, cutoff=0.6)[0]
df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
for i in range(len(df) - max(history, weeks_ahead)):
if i + history + weeks_ahead >= len(df):
X.append(df[matching_features].iloc[i:i+history].values)
else:
X.append(df[matching_features].iloc[i:i+history].values)
y.append(df[closing_price_col].iloc[i+history+weeks_ahead-1])
dates.append(df[date_col].iloc[i+history+weeks_ahead-1])
return np.array(X), np.array(y), np.array(dates)
def test_and_predict(X, y, dates, weeks, k, classifier, random_state, test_size=0.2):
if len(X.shape) == 3:
n_samples, n_time_steps, n_features = X.shape
X = X.reshape(n_samples, n_time_steps*n_features)
x_testing = X[:len(X)-weeks]
y_testing = y[:len(y)]
scaler = StandardScaler()
X = scaler.fit_transform(X)
if k > len(X):
return None, None, None
mape_scores = []
all_y_true = []
all_y_pred = []
all_dates = []
lenX = len(x_testing)
lenY = len(y_testing)
for i in range(1,weeks):
X_train, X_test = x_testing[:lenX-i], x_testing[lenX-i:lenX-i+1]
y_train, y_test = y_testing[:lenY-i], y_testing[lenY-i:lenY-i+1]
dates_test = dates[lenX-i:lenX-i+1]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
all_y_true.extend(y_test)
all_y_pred.extend(y_pred)
all_dates.extend(dates_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
mape_scores.append(mape)
accuracy_scores = [1 - mape for mape in mape_scores]
results = {
"weeks": weeks,
"cross_val_scores": accuracy_scores,
"accuracy_scores": accuracy_scores,
"mean_accuracy": np.mean(accuracy_scores),
"std_accuracy": np.std(accuracy_scores),
"random_state": random_state,
"y_true": all_y_true,
"y_pred": all_y_pred,
"dates": all_dates
}
plt.figure(figsize=(10, 6))
plt.plot(all_dates, all_y_true, label='True Values', marker='o')
plt.plot(all_dates, all_y_pred, label='Predicted Values', marker='x')
plt.xlabel('Date')
plt.ylabel('Values')
plt.title('True vs. Predicted Values Over Time')
plt.legend()
plt.show()
return classifier, scaler, results
def train_and_predict(X, y, dates, weeks, k, classifier, random_state, test_size=0.2):
if len(X.shape) == 3:
n_samples, n_time_steps, n_features = X.shape
X = X.reshape(n_samples, n_time_steps*n_features)
scaler = StandardScaler()
X = scaler.fit_transform(X)
if k > len(X):
return None, None, None
mape_scores = []
all_y_true = []
all_y_pred = []
all_dates = []
for i in range(1,weeks):
X_train, X_test = X[:len(X)-weeks], X[len(X)-i:len(X)-i+1]
print(X_test)
y_train = y[:len(y)]
dates_test = dates[len(y)-i:len(y)-i+1]
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
all_y_pred.extend(y_pred)
plt.figure(figsize=(10, 6))
plt.plot( all_y_pred, label='Predicted Values', marker='x')
plt.xlabel('Date')
plt.ylabel('Values')
plt.title('Predicted Values Over Time')
plt.legend()
plt.show()
Charts: The first graph shows the results of testing, the second graph shows the results of forecasting:
enter image description here
enter image description here