I have recently trained a predictive model using historical climate data from the file ClimateFlow_Data1950_2021_Daily_spider.xlsx, specifically focusing on the ‘RCP26’ sheet. My aim is to utilize this model to forecast Flow values using data from another file, RCPs_2025_2100.xlsx, with the target sheet being ‘RCPs_2025_2100’.
To provide some context, the predictors in this scenario are Precipitation (PPT) and Maximum Temperature (Tmax), while the target variable I am seeking to predict is Flow.
However, I’ve encountered an issue: the Flow column in the second data file (RCPs_2025_2100.xlsx) is empty, and I need to fill it with the predicted values generated by my model.
Could someone kindly guide me on how to proceed with this task? I’d greatly appreciate any insights or suggestions you might have.
Thank you in advance!
df.head()
Out[6]:
Date PPT Tmax Flow
0 1950-01-01 12:00:00 -0.000092 -3.384451 58.0
1 1950-01-02 12:00:00 2.026306 -4.189279 61.2
2 1950-01-03 12:00:00 -0.000092 -2.875478 63.4
3 1950-01-04 12:00:00 -0.000092 -3.103854 49.0
4 1950-01-05 12:00:00 1.196228 -2.425506 60.9
df2=df = pd.read_excel('C:/LSTM_Python_Climate_streamflow/RCPs_2025_2100.xlsx', sheet_name='RCPs_2025_2100')
df2.head()
Out[8]:
Date PPT Tmax Flow
0 2025-01-01 3.96 -0.87 NaN
1 2025-01-02 2.93 -2.05 NaN
2 2025-01-03 3.01 -1.32 NaN
3 2025-01-04 4.57 -1.67 NaN
4 2025-01-05 2.16 -1.89 NaN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_excel('C:/LSTM_Python_Climate_streamflow/ClimateFlow_Data1950_2021_Daily_spider.xlsx')
df.head()
# Drop rows with missing values
df.dropna(inplace=True)
#Find the correlation
import seaborn as sn
sn.heatmap(df.corr())
training_set = df.iloc[:26298, 1:5].values
test_set = df.iloc[26298:, 1:5].values
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0,1))
training_set_scaled = sc.fit_transform(training_set)
test_set_scaled = sc.fit_transform(test_set)
test_set_scaled = test_set_scaled[:, 0:3]
X_train = []
y_train = []
WS = 24
for i in range(WS, len(training_set_scaled)):
X_train.append(training_set_scaled[i-WS:i, 0:4])
y_train.append(training_set_scaled[i,3])
X_train, y_train = np.array(X_train), np.array(y_train)
#X_train = np.reshape(X_train,(X_train.shape[0], X_train.shape[3], 4))
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], X_train.shape[2]))
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
Model = Sequential()
Model.add(LSTM(units = 70, return_sequences = True, input_shape = (X_train.shape[1], 4)))
Model.add(Dropout(0.2))
Model.add(LSTM(units = 70, return_sequences = True))
Model.add(Dropout(0.2))
Model.add(LSTM(units = 70, return_sequences = True))
Model.add(Dropout(0.2))
Model.add(LSTM(units = 70))
Model.add(Dropout(0.2))
Model.add(Dense(units = 1))
Model.compile(optimizer = 'adam', loss = 'mean_squared_error')
Model.fit(X_train,y_train, epochs = 5, batch_size = 32)
plt.plot(range(len(Model.history.history['loss'])), Model.history.history['loss'])
plt.xlabel('Epoch Number')
plt.ylabel('Loss')
plt.show()
Model.save('LSTM_ClimateChange_RCP85_24.h5')
from keras.models import load_model
Model = load_model('LSTM_ClimateChange_RCP85_24.h5')
prediction_test = []
Batch_one = training_set_scaled[-24:]
Batch_New = Batch_one.reshape((1,24,4))
for i in range(48):
First_pred = Model.predict(Batch_New)[0]
prediction_test.append(First_pred)
New_var = test_set_scaled[i,:]
New_var = New_var.reshape(1,3)
New_test = np.insert(New_var, 3, [First_pred], axis =1)
New_test = New_test.reshape(1,1,4)
Batch_New = np.append(Batch_New[:,1:,:], New_test, axis=1)
prediction_test = np.array(prediction_test)
SI = MinMaxScaler(feature_range = (0,3))
y_Scale = training_set[:,3:4]
SI.fit_transform(y_Scale)
predictions = SI.inverse_transform(prediction_test)
real_values = test_set[:, 3]
# Plot the first 100 values
plt.plot(real_values[:100], color='black', label='Actual Flow')
plt.plot(predictions[:100], color='blue', label='Predicted Flow')
plt.title('Humber River Flow Prediction')
plt.xlabel('Time (Day)')
plt.ylabel('Flow (M3/s)')
plt.legend()
plt.show()
#### Future data prediction from second data file
df2=df = pd.read_excel('C:/LSTM_Python_Climate_streamflow/RCPs_2025_2100.xlsx', sheet_name='RCPs_2025_2100')
df2.head()
df2.columns = df.columns.str.strip()
df2.index = pd.to_datetime(df['Date'], format='%Y.%m.%d')
df2.head()
print(df2.columns)
flow = df2['Flow']
def df_to_X_y(df2, window_size=5):
df_as_np = df.to_numpy()
X = []
y = []
for i in range(len(df_as_np)-window_size):
row = [[a] for a in df_as_np[i:i+window_size]]
X.append(row)
label = df_as_np[i+window_size]
y.append(label)
return np.array(X), np.array(y)
WINDOW_SIZE = 5
X1, y1 = df_to_X_y(flow, WINDOW_SIZE)
X1.shape, y1.shape
X_pred, y_pred = X1[:20000], y1[:20000]
X_pred.shape, y_pred.shape,
predictions = model1.predict(X_pred)
df2.index = pd.to_datetime(df2['Date'])
df2.drop(columns=['Date'], inplace=True)
def df_to_sequences(df2, window_size=5):
sequences = []
for i in range(len(df) - window_size):
sequence = df.iloc[i:i + window_size]
sequences.append(sequence)
return sequences
sequences = df_to_sequences(df2)
X_new = np.array(sequences)
#X_new = X_new.reshape(X_new.shape[0], X_new.shape[1], X_new.shape[2])
predictions = model1.predict(X_new)
print("X_new shape:", X_new.shape) # Debugging statement
predictions = model1.predict(X_new)
[enter image description here](https://i.sstatic.net/vQzzPSo7.jpg)