I’m trying to train my model for mushroom classification, but when I try to train the model it gives outputs an unknown graph execution model. When looking at the error, is said that the GPU wasn’t able to execute the best cudnn conv algorithm because it ran out of memory. I tried to use a smaller model but it didn’t fixed the problem. Can anyone help me figure out what went wrong?
This is my code:
import os
import shutil
import time
import numpy as np
import cv2
from glob import glob
from matplotlib import pyplot
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras import layers as L
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
img_height = 512
img_width = 512
batch_size = 32
data_dir = "/kaggle/input/mushrooms-images-classification-215/data/data"
train_ds = tf.keras.utils.image_dataset_from_directory(
data_dir,
validation_split=0.05,
subset="training",
seed=123,
image_size=(img_height, img_width),
batch_size=batch_size)
val_ds = tf.keras.utils.image_dataset_from_directory(
data_dir,
validation_split=0.05,
subset="validation",
seed=123,
image_size=(img_height, img_width),
batch_size=batch_size)
num_classes = 215
model = tf.keras.Sequential([
tf.keras.layers.InputLayer(input_shape = (512, 512, 3), batch_size = batch_size),
tf.keras.layers.Conv2D(256, 3, activation='relu'),
tf.keras.layers.MaxPooling2D(),
tf.keras.layers.Conv2D(128, 3, activation='relu'),
tf.keras.layers.MaxPooling2D(),
tf.keras.layers.Conv2D(64, 3, activation='relu'),
tf.keras.layers.MaxPooling2D(),
tf.keras.layers.Conv2D(32, 3, activation='relu'),
tf.keras.layers.MaxPooling2D(),
tf.keras.layers.Conv2D(16, 3, activation='relu'),
tf.keras.layers.MaxPooling2D(),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(num_classes)
])
model.compile(
optimizer='adam',
loss=tf.keras.losses.SparseCategoricalCrossentropy(),
metrics=['accuracy'])
history = model.fit(train_ds, validation_data=val_ds, epochs=10)
This is the error:
UnknownError: Graph execution error:
Detected at node StatefulPartitionedCall defined at (most recent call last):
File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
File "/opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
File "/opt/conda/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start
File "/opt/conda/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell
File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 362, in execute_request
File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request
File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 449, in do_execute
File "/opt/conda/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3051, in run_cell
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3106, in _run_cell
File "/opt/conda/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3311, in run_cell_async
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3493, in run_ast_nodes
File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
File "/tmp/ipykernel_36/1248756199.py", line 1, in <module>
File "/opt/conda/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler
File "/opt/conda/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 314, in fit
File "/opt/conda/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 117, in one_step_on_iterator
Failed to determine best cudnn convolution algorithm for:
%cudnn-conv-bias-activation.15 = (f32[32,256,510,510]{3,2,1,0}, u8[0]{0}) custom-call(f32[32,3,512,512]{3,2,1,0} %transpose.86, f32[256,3,3,3]{3,2,1,0} %transpose.87, f32[256]{0} %arg3.4), window={size=3x3}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBiasActivationForward", metadata={op_type="Conv2D" op_name="sequential_1/conv2d_1/convolution" source_file="/opt/conda/lib/python3.10/site-packages/tensorflow/python/framework/ops.py" source_line=1177}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"conv_result_scale":1,"activation_mode":"kRelu","side_input_scale":0,"leakyrelu_alpha":0}}
Original error: RESOURCE_EXHAUSTED: Out of memory while trying to allocate 8539734016 bytes.
To ignore this failure and try to use a fallback algorithm (which may have suboptimal performance), use XLA_FLAGS=--xla_gpu_strict_conv_algorithm_picker=false. Please also file a bug for the root cause of failing autotuning.
[[{{node StatefulPartitionedCall}}]] [Op:__inference_one_step_on_iterator_2682]
5