I’ve developed a TensorFlow model for an artificial intelligence project, but I’m running into NaN values in the loss function during training. Here’s an extract from my code:
# NOTE: TF_ENABLE_ONEDNN_OPTS must be set BEFORE tensorflow is imported,
# which is why os is imported and the env var written at the very top.
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
# Project-local module that exposes the train/val/test splits and the
# "values" config dict (DoLoad, DoBatch, BatchSize, ...).
import data as dt
print("[Info] Data loaded")
import numpy as np
print("[Info] Numpy loaded")
import random
print("[Info] Random loaded")
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, LambdaCallback
print("[Info] Tensorflow loaded")
# atexit appears unused in this extract — presumably used elsewhere; verify.
import atexit
print("[Info] Atexit loaded")
import threading
print("[Info] Threading loaded")
# Runtime configuration — must run after TensorFlow is imported.
# Enable memory growth on the first GPU so TF does not grab all VRAM up front.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_memory_growth(gpus[0], enable=True)
# 0 means "let TensorFlow choose an appropriate number of threads".
tf.config.threading.set_intra_op_parallelism_threads(0)
tf.config.threading.set_inter_op_parallelism_threads(0)
# Module-level aliases for the dataset splits prepared by the data module.
inputs, outputs = dt.train_inputs, dt.train_outputs
val_inputs, val_outputs = dt.val_inputs, dt.val_outputs
test_inputs, test_outputs = dt.test_inputs, dt.test_outputs
def check_data(data, name):
    """Report whether *data* contains NaN or infinite values.

    Args:
        data: array-like of numbers (anything ``np.isnan``/``np.isinf`` accept).
        name: label used in the printed report.

    Returns:
        bool: True when every value is finite, False otherwise.
        (New, backward-compatible: the previous version returned None and
        no existing caller uses the return value.)
    """
    if np.isnan(data).any() or np.isinf(data).any():
        # Runtime messages kept verbatim (French) so output is unchanged.
        print(f"[Error] {name} contient des NaNs ou des valeurs infinies")
        return False
    print(f"[Info] {name} est valide")
    return True
# Validate every dataset split before training starts (same order as before).
for arr, label in (
    (inputs, "train_inputs"),
    (outputs, "train_outputs"),
    (val_inputs, "val_inputs"),
    (val_outputs, "val_outputs"),
    (test_inputs, "test_inputs"),
    (test_outputs, "test_outputs"),
):
    check_data(arr, label)
class EThAI:
    """Wrapper around a small 3-class softmax classifier.

    Builds, trains, saves and reloads a feed-forward Keras model persisted
    under the "CryptoAIModel" directory. Training data comes from the
    module-level ``inputs``/``outputs``/``val_inputs``/``val_outputs``.
    """

    def __init__(self, save: bool = True, load: bool = dt.values["DoLoad"]):
        # Lock to serialise access to the model file across threads.
        self.file_access_lock = threading.Lock()
        self.save = bool(save)
        self.load = bool(load)
        self.build_model()
        if self.save:
            self.model.save("CryptoAIModel", save_format='tf')

    def build_model(self, learning_rate=0.0001):
        """(Re)create and compile ``self.model``.

        When ``self.load`` is True the freshly built architecture is replaced
        by the model persisted on disk before compiling.

        Args:
            learning_rate: Adam learning rate.
        """
        params = {
            # Fix: layer widths must be integers — 256.0 (float) can raise a
            # TypeError in tf.keras.layers.Dense.
            'dense_units1': 256,
            'dense_units2': 64,
        }
        # Model with tuned hyperparameters.
        # NOTE(review): sigmoid hidden activations saturate easily and are a
        # common contributor to vanishing gradients / NaN loss; 'relu' is the
        # usual choice here — confirm before changing training behaviour.
        self.model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(33 * 80,)),
            tf.keras.layers.Dense(params["dense_units1"], activation='sigmoid'),
            tf.keras.layers.Dense(params["dense_units2"], activation='sigmoid'),
            tf.keras.layers.Dense(3, activation='softmax'),
        ])
        if self.load:
            # Replaces the freshly built model with the persisted one.
            self.model = tf.keras.models.load_model("CryptoAIModel")
        # Compile with Adam. categorical_crossentropy expects one-hot targets.
        # clipnorm caps the gradient norm — a standard guard against the
        # exploding gradients that produce NaN losses.
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                               clipnorm=1.0),
            loss='categorical_crossentropy',
            metrics=['accuracy'])
        print(f"[Info] Model Build (learning_rate={learning_rate})")

    def train(self, epochs: int):
        """Fit the model on the module-level training split.

        Checkpoints to "CryptoAIModel" after every epoch and once more at the
        end; early-stops on stagnating validation loss.
        """
        print("[Info] Starting...")

        def save_checkpoint(epoch, _):
            # Persist after every epoch so progress survives interruptions.
            self.model.save("CryptoAIModel", save_format='tf')

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=5,
                          restore_best_weights=True),
            LambdaCallback(on_epoch_end=save_checkpoint),
            tf.keras.callbacks.TensorBoard(log_dir="./logs"),
            # Abort training as soon as the loss becomes NaN instead of
            # wasting the remaining epochs.
            tf.keras.callbacks.TerminateOnNaN(),
        ]
        # Fix: the original DoBatch if/else had two literally identical
        # branches (batch_size was passed in both), so the split is collapsed.
        self.model.fit(inputs, outputs,
                       epochs=epochs,
                       batch_size=dt.values["BatchSize"],
                       verbose=1,
                       validation_data=(val_inputs, val_outputs),
                       callbacks=callbacks)
        self.model.save("CryptoAIModel", save_format='tf')
        print("[Info] Finished !!!")

    def predict(self, input):
        """Run the model on *input*; returns the raw softmax probabilities.

        (Parameter name shadows the builtin but is kept for caller
        compatibility.)
        """
        return self.model.predict(input)

    def final_output(self, output):
        """Threshold the first prediction row at 0.5 into a 0/1 list.

        NOTE(review): with a 3-way softmax the probabilities sum to 1, so at
        most one entry can exceed 0.5 — confirm this decoding is intended.
        """
        return [0 if value < 0.5 else 1 for value in output[0]]

    def get_random_input_output(self, batch_size=None):
        """Return a random contiguous (inputs, outputs) pair of *batch_size* rows.

        Fix: the default previously read ``dt.values["DoBatch"]`` (a boolean
        flag) instead of ``dt.values["BatchSize"]``, and was evaluated once at
        class-definition time; the correct key is now resolved per call.
        """
        if batch_size is None:
            batch_size = dt.values["BatchSize"]
        start = random.randint(0, len(inputs) - batch_size)
        input_data = np.array([inputs[start + i] for i in range(batch_size)])
        output_data = np.array([outputs[start + i] for i in range(batch_size)])
        return input_data, output_data
if __name__ == '__main__':
    AI = EThAI(save=True)
    try:
        # First round trains the freshly built model; from then on the model
        # is reloaded from disk each time build_model runs.
        AI.build_model(0.001)
        dt.values["DoLoad"] = True
        AI.load = True
        print("[Info] Training Soon!!!")
        AI.train(20)
        # Subsequent rounds: same schedule, decaying learning rate.
        for lr in (0.0001, 0.00001, 0.000001, 0.0000001):
            AI.build_model(lr)
            AI.train(20)
        dt.values["DoLoad"] = True
    except Exception as e:
        print(e)
    if AI.save:
        AI.model.save("CryptoAIModel", save_format='tf')
I have already checked my data for NaN or infinite values before training, but the problem persists. How can I solve this NaN problem in the loss function of my TensorFlow model? I am using TensorFlow version 2.13.0 on Python 3.8.10 with GPU.
Thanks in advance for your help!
Sacha Levatic is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.