I’m working on predicting the next image in a sequence from the past images. My raw data has shape (50, 101, 128, 128, 3): 50 sequences of 101 RGB frames of size 128×128. To prepare it for training a convolutional auto-encoder, I reshape it from (50, 101, 128, 128, 3) to (5050, 128, 128, 3), normalize with X/255 - 0.5, and split it into training and validation sets: X_train has shape (4545, 128, 128, 3) and X_test has shape (505, 128, 128, 3). The latent vector has size 250.
Training the auto-encoder goes smoothly, and the reconstructed images are very faithful to the originals. After training, I encode my images in preparation for training the LSTM model, but after scaling the encoded vectors and then applying inverse_transform, the re-displayed images are degraded. I’ve tried varying the normalization method without success, including the NDStandardScaler from this question: How to standard scale a 3D matrix?
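For reference, the preparation step is essentially the following (a minimal sketch; `dataset` is an assumed name for the raw uint8 array):

```python
import numpy as np

# dataset: raw frames, shape (50, 101, 128, 128, 3), assumed uint8
X = dataset.reshape(-1, 128, 128, 3).astype('float32')
X = X / 255.0 - 0.5                      # normalize to [-0.5, 0.5]

# 90/10 split (45 sequences / 5 sequences of 101 frames each)
X_train, X_test = X[:4545], X[4545:]
```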
from tensorflow.keras import backend as K
from tensorflow.keras.layers import (Input, Conv2D, Conv2DTranspose,
                                     Dense, Flatten, Reshape)
from tensorflow.keras.models import Model

def build_autoencoder(img_shape, latent_dim):
    layer_filters = [32, 64]
    kernel_size = 3

    # The encoder
    inputs = Input(shape=img_shape, name='encoder_input')
    x = inputs
    # stack of Conv2D(32)-Conv2D(64), each halving the spatial size
    for filters in layer_filters:
        x = Conv2D(filters=filters,
                   kernel_size=kernel_size,
                   activation='relu',
                   strides=2,
                   padding='same')(x)
    # remember the feature-map shape so the decoder can mirror it
    shape = K.int_shape(x)
    # generate the latent vector
    x = Flatten()(x)
    latent = Dense(latent_dim, name='latent_vector')(x)
    encoder = Model(inputs, latent, name='encoder')

    # The decoder
    latent_inputs = Input(shape=(latent_dim,), name='decoder_input')
    x = Dense(shape[1] * shape[2] * shape[3])(latent_inputs)
    x = Reshape((shape[1], shape[2], shape[3]))(x)
    # stack of Conv2DTranspose(64)-Conv2DTranspose(32)
    for filters in layer_filters[::-1]:
        x = Conv2DTranspose(filters=filters,
                            kernel_size=kernel_size,
                            activation='relu',
                            strides=2,
                            padding='same')(x)
    # reconstruct the input
    outputs = Conv2DTranspose(filters=3,
                              kernel_size=kernel_size,
                              activation='linear',
                              padding='same',
                              name='decoder_output')(x)
    decoder = Model(latent_inputs, outputs, name='decoder')
    return encoder, decoder
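For completeness, the two halves would be assembled and trained along these lines (a sketch; the actual training call isn't shown in the question, and the epoch/batch settings are placeholders):

```python
encoder, decoder = build_autoencoder(img_shape=(128, 128, 3), latent_dim=250)

# chain encoder and decoder into the full autoencoder and train on pixels
autoencoder = Model(encoder.input, decoder(encoder.output), name='autoencoder')
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train, X_train,
                validation_data=(X_test, X_test),
                epochs=30, batch_size=32)
```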
NB: the loss converges very well and the reconstructed images are very faithful to the originals.
# reshape the flat frames back into sequences: (45, 101, 128, 128, 3)
X_train = X_train.reshape(dataset_train.shape)
encoded_data_train = []
for i in range(dataset_train.shape[0]):
    if i % 100 == 0:
        print(f"Processing sampling train {i}/{dataset_train.shape[0]}")
    encoded_sequence = []
    for j in range(dataset_train.shape[1]):
        img = X_train[i][j]
        # encode one frame at a time into a 250-d latent vector
        encoded_frame = encoder.predict(img[None])[0]
        encoded_sequence.append(encoded_frame)
    encoded_data_train.append(encoded_sequence)
X_encoded_train = np.asarray(encoded_data_train)  # (45, 101, 250)
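As an aside, the per-frame loop can be replaced with a single batched predict call, which is much faster and produces the same (45, 101, 250) array (a sketch):

```python
# encode all frames in one batched call instead of frame-by-frame
flat_frames = X_train.reshape(-1, 128, 128, 3)               # (4545, 128, 128, 3)
codes = encoder.predict(flat_frames, batch_size=64)          # (4545, 250)
X_encoded_train = codes.reshape(dataset_train.shape[0],
                                dataset_train.shape[1], -1)  # (45, 101, 250)
```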
# verification before normalization
def visualize2(img, code, decoder):
    # show the original image, its latent code, and the decoder's reconstruction
    reco = decoder.predict(code[None])[0]
    plt.subplot(1, 3, 1)
    plt.title("original")
    show_image(img)
    plt.subplot(1, 3, 2)
    plt.title("code")
    plt.imshow(code.reshape([code.shape[-1] // 2, -1]))
    plt.subplot(1, 3, 3)
    plt.title("Reconstructed")
    show_image(reco)

for i in range(5):
    # X_encoded_flattened_test: one 250-d code per frame, presumably shape (505, 250)
    test_index = np.random.choice(range(len(X_encoded_flattened_test)), size=1)[0]
    img = X_test[test_index]
    code = X_encoded_flattened_test[test_index]
    visualize2(img, code, decoder)
    plt.show()
# Prepare data for the LSTM model
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
X_1_train_scaled = scaler.fit_transform(X_encoded_flattened_train)
X_1_test_scaled = scaler.transform(X_encoded_flattened_test)

# test after scaling the encoded images
for i in range(5):
    test_index = np.random.choice(range(len(X_1_test_scaled)), size=1)[0]
    img = X_test[test_index]
    code = X_1_test_scaled[test_index]
    # undo the scaling before decoding
    code = scaler.inverse_transform(code.reshape(1, -1)).reshape(-1)
    visualize2(img, code, decoder)
    plt.show()
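One quick numeric check that isn't in the original code: the MinMaxScaler round trip should be lossless up to float precision, so comparing the codes before and after transform/inverse_transform tells you whether the degradation comes from the scaler itself or from somewhere else:

```python
# the round trip transform -> inverse_transform should recover the codes
recovered = scaler.inverse_transform(scaler.transform(X_encoded_flattened_test))
print(np.abs(recovered - X_encoded_flattened_test).max())
print(np.allclose(recovered, X_encoded_flattened_test, atol=1e-5))
```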
N_outputs = 5  # predict the last N frames

# Apply the processing function to the datasets.
# X_2_train / X_2_test: the scaled codes reshaped back into sequences,
# presumably (45, 101, 250) and (5, 101, 250); the reshape isn't shown above
x_train_scaled, y_train_scaled = create_shifted_frames(X_2_train, N_outputs)
x_val_scaled, y_val_scaled = create_shifted_frames(X_2_test, N_outputs)
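create_shifted_frames isn't defined in the question. Given the shapes used below (96 input steps, 5 output steps per 101-frame sequence), it presumably splits each sequence as follows (an assumed implementation):

```python
def create_shifted_frames(data, n_outputs):
    # data: (n_sequences, seq_len, latent_dim)
    x = data[:, :-n_outputs, :]   # first seq_len - n_outputs frames, e.g. (45, 96, 250)
    y = data[:, -n_outputs:, :]   # last n_outputs frames, e.g. (45, 5, 250)
    return x, y
```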
# LSTM model (encoder-decoder over the latent sequences)
from tensorflow.keras.layers import LSTM, RepeatVector
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

N_INPUTS = 96      # 101 frames per sequence minus the 5 to predict
N_OUTPUTS = 5
N_FEATURES = 250   # latent dimension
N_BLOCKS = 500

optimizer = Adam(learning_rate=0.001)
model = Sequential()
model.add(LSTM(N_BLOCKS, input_shape=(N_INPUTS, N_FEATURES)))
model.add(RepeatVector(N_OUTPUTS))
model.add(LSTM(N_BLOCKS, return_sequences=True))
model.add(LSTM(N_BLOCKS, return_sequences=True))
model.add(Dense(N_FEATURES))   # applied independently at each time step
model.compile(optimizer=optimizer, loss='mse')
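The fit call isn't shown in the question; something along these lines is assumed (epochs and batch size are placeholders):

```python
model.fit(x_train_scaled, y_train_scaled,
          validation_data=(x_val_scaled, y_val_scaled),
          epochs=50, batch_size=8)
```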
NB: the LSTM training loss converges very well and ends below 0.05 for both training and validation.
# plot images after training the LSTM model
code_size = 250
test_index = np.random.choice(range(len(x_val_scaled)), size=1)[0]
test_sequence = x_val_scaled[test_index]
real_images = dataset_test[test_index, -N_outputs:]

# predict the next 5 latent codes
predicted_sequence = model.predict(test_sequence[None, :, :])

# reverse the normalization
y_pred = scaler.inverse_transform(predicted_sequence[0])
y_pred = y_pred.reshape(-1, code_size)

# decode each predicted latent code back into an image
decoded_images = []
for encoded_frame in y_pred:
    decoded_frame = decoder.predict(encoded_frame[None])[0]
    decoded_images.append(decoded_frame)
decoded_images = np.asarray(decoded_images)

fig, axes = plt.subplots(2, 5, figsize=(15, 6))
# plot the real images
for ax, img in zip(axes[0], real_images):
    ax.imshow(img)
# plot the predicted images (shift back from [-0.5, 0.5] to [0, 1])
for ax, img in zip(axes[1], decoded_images):
    ax.imshow(np.clip(img + 0.5, 0, 1))
plt.show()
[Figure: the five real frames (top row) against the five frames predicted by the LSTM (bottom row)]
I’ve included the code for the various steps, with a few comments. The real problem is the image reconstruction after denormalization and after training the LSTM model. I don’t know whether a more specific procedure applies when using CNN-based auto-encoders, but it’s worth noting that when I use auto-encoders with dense layers only, I don’t have this problem (even though the steps are identical).