This code forecasts energy consumption. I wrote it, and when I examined it there seemed to be no data leakage. However, after a while, the MAPE of its predictions about the future increases considerably. Can you check whether there is data leakage in the code I wrote? (I checked with GPT, but I'm not sure its answer is correct. :))
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, explained_variance_score, r2_score, mean_absolute_percentage_error
import matplotlib.pyplot as plt
# ---------------------------------------------------------------------------
# Load the spreadsheet and build the feature matrix (X), target (y) and
# timestamp series (tarih) used by the back-test further down.
# ---------------------------------------------------------------------------
file_path = 'data.xlsx'
data = pd.read_excel(file_path)

# Columns that are not used as model inputs.
data = data.drop(
    columns=["Unnamed: 0", "Akşam", "Vardiya_A", "Vardiya_B", "Vardiya_C", "Yeni Çeyrek", "Çeyrek"]
)
data['Tarih'] = pd.to_datetime(data['Tarih'])
data = data.dropna()

# The back-test splits train/test purely by row position, so the rows MUST be
# in chronological order; otherwise "future" rows can end up in the training
# slice. A stable sort keeps the original order of rows with equal timestamps,
# and the index is reset so positions and labels agree after dropna/sort.
data = data.sort_values('Tarih', kind='stable').reset_index(drop=True)

# One-hot encode the categorical columns (first level dropped).
data = pd.get_dummies(data, prefix_sep='_', drop_first=True)

# Make column labels unique: any label that occurs more than once gets its
# column position appended; labels that are already unique are kept as-is.
is_repeated = data.columns.duplicated(keep=False)
data.columns = pd.Index([
    f"{col}_{pos}" if repeated else col
    for pos, (col, repeated) in enumerate(zip(data.columns, is_repeated))
])

X = data.drop(columns=["Enerji", "Tarih"])  # model inputs
y = data['Enerji']                          # target: energy consumption
tarih = data['Tarih']                       # timestamp of every row

# Number of previous target values averaged into the 'Moving_Avg' feature.
window_size = 3

# Accumulator for the per-row back-test results.
results_df = pd.DataFrame(columns=['Model', 'Tarih', 'Saat', 'Gerçek Değer', 'Tahmin Edilen Değer', 'MAPE'])
# Estimators to back-test, given as (display name, regressor instance) pairs.
# Only CatBoost is active; the alternatives below are kept for reference and
# can be added to the list to include them in the comparison:
#   ('RF', RandomForestRegressor())
#   ('GBM', GradientBoostingRegressor())
#   ("XGBoost", XGBRegressor(objective='reg:squarederror'))
#   ("LightGBM", LGBMRegressor())
#   ("ExtraTrees", ExtraTreesRegressor())
models = [("CatBoost", CatBoostRegressor(verbose=False))]
# ---------------------------------------------------------------------------
# Walk-forward back-test with recursive multi-step forecasts.
#
# For every forecast origin, each model is refit on all rows before the origin
# and then predicts the next `forecast_horizon` rows one at a time.
#
# Notes on leakage and on why the error grows with the horizon:
# * During training, 'Moving_Avg' for a row is the mean of the `window_size`
#   TRUE targets that precede it (rolling mean shifted by one), so it does not
#   contain the row's own target.
# * During forecasting, true targets of the test rows are never used: the
#   feature is rebuilt from the model's own previous predictions. Errors fed
#   back this way can compound step by step, so MAPE tends to rise toward the
#   end of each forecast window. That is error accumulation, not leakage.
# * Origins advance one row at a time while each forecast spans
#   `forecast_horizon` rows, so the same row is forecast from several origins.
# ---------------------------------------------------------------------------
first_offset = 1272    # earliest origin: this many rows before the end of the data
last_offset = 552      # latest origin: this many rows before the end of the data
forecast_horizon = 24  # number of consecutive rows forecast from each origin

n_rows = len(X)
window_frames = []  # one result frame per (origin, model); concatenated once at the end

for i in range(first_offset, last_offset - 1, -1):
    split = n_rows - i  # position of the first test row
    if split <= window_size:
        # Not enough history to build the moving-average feature and train.
        continue
    test_end = min(split + forecast_horizon, n_rows)

    X_train = X.iloc[:split].copy()
    y_train = y.iloc[:split].copy()
    X_test = X.iloc[split:test_end].copy()
    y_test = y.iloc[split:test_end].copy()
    tarih_test = tarih.iloc[split:test_end].copy()
    if X_test.empty:
        continue

    # Mean of the previous `window_size` true targets. The first `window_size`
    # rows have no complete history (NaN); back-filling them would copy in a
    # mean that includes their own targets, so they are left out of fitting.
    X_train['Moving_Avg'] = y_train.rolling(window=window_size).mean().shift(1)
    X_fit = X_train.iloc[window_size:]
    y_fit = y_train.iloc[window_size:]

    for name, model in models:
        model.fit(X_fit, y_fit)

        y_pred = []
        # Seed the recursion with the last true targets known at the origin.
        y_temp = y_train.iloc[-window_size:].tolist()
        for j in range(len(X_test)):
            X_test_single = X_test.iloc[j:j + 1, :].copy()
            # Update the moving average from the most recent values, which
            # become the model's own predictions as the horizon advances.
            X_test_single['Moving_Avg'] = np.mean(y_temp[-window_size:])
            pred = model.predict(X_test_single)[0]
            y_pred.append(pred)
            y_temp.append(pred)
        y_pred = np.array(y_pred)

        # One MAPE per forecast window, repeated on each of its rows.
        mape = mean_absolute_percentage_error(y_test, y_pred)
        temp_df = pd.DataFrame({
            'Model': name,
            'Tarih': tarih_test.dt.date,
            'Saat': tarih_test.dt.time,
            'Gerçek Değer': y_test.values,
            'Tahmin Edilen Değer': y_pred,
            'MAPE': [mape] * len(y_test)
        })
        window_frames.append(temp_df)
    print(f'Kalan Saat: {i}')

# Concatenate once: calling pd.concat inside the loop re-copies every earlier
# row on each iteration.
if window_frames:
    results_df = pd.concat(window_frames, ignore_index=True)
# ---------------------------------------------------------------------------
# Plot the actual series against each model's forecasts.
# ---------------------------------------------------------------------------
plt.figure(figsize=(12, 6))
plt.plot(data['Tarih'], y, label='Gerçek Enerji Tüketimi')

# results_df stores the date and the time of day in separate columns; join
# them back into one timestamp per forecast row for the x-axis.
forecast_time = pd.to_datetime(
    results_df['Tarih'].astype(str) + ' ' + results_df['Saat'].astype(str)
)
plot_df = pd.DataFrame({
    'Model': results_df['Model'],
    'Zaman': forecast_time,
    'Tahmin': results_df['Tahmin Edilen Değer'].astype(float),
})

# A timestamp can appear in several forecast windows, so average the
# predictions per timestamp. Each model gets its own line whose x and y have
# the same length (a single per-model mean cannot be plotted against one x
# value per result row).
for model_name, model_rows in plot_df.groupby('Model'):
    mean_by_time = model_rows.groupby('Zaman')['Tahmin'].mean()
    plt.plot(mean_by_time.index, mean_by_time.values, linestyle='--', label=model_name)

plt.xlabel('Tarih')
plt.ylabel('Enerji Tüketimi')
plt.title('Farklı Modellerle Zaman Serisi Tahmini')
plt.legend()
plt.show()

# Preview of the collected results (displayed when run in a notebook cell).
results_df.head()