I am trying to build a sign language action recognition model. I have frames that I transformed into landmark keypoints using MediaPipe, and they are stored in .npy format.
Below is my model:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
def enhanced_cnn_lstm_model(num_frames=30, num_keypoints=1662, num_classes=502):
    """Build and compile a stacked-LSTM classifier for keypoint sequences.

    The network is three LSTM layers (128, 256, 512 units), each followed by
    batch normalization and dropout, then an L2-regularized dense layer and a
    softmax output.

    Args:
        num_frames: Number of time steps in every input sequence.
        num_keypoints: Number of features per time step.
        num_classes: Width of the softmax output layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    sequence_shape = (num_frames, num_keypoints)

    layer_stack = [
        # First recurrent layer fixes the input shape and emits the full
        # sequence so the next LSTM can consume it.
        LSTM(units=128, return_sequences=True, input_shape=sequence_shape, recurrent_dropout=0.2),
        BatchNormalization(),
        Dropout(0.5),
        LSTM(units=256, return_sequences=True),
        BatchNormalization(),
        Dropout(0.5),
        # Last recurrent layer collapses the sequence to a single vector.
        LSTM(units=512, return_sequences=False),
        BatchNormalization(),
        Dropout(0.5),
        # Classification head.
        Dense(1024, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]

    network = Sequential(layer_stack)
    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return network
My data generator:
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import numpy as np
import os
# Debugging aid: makes every tf.function run eagerly instead of as a traced
# graph. NOTE(review): this usually slows training noticeably -- confirm it is
# still needed before long training runs.
tf.config.run_functions_eagerly(True)
class KeypointsDataGenerator(Sequence):
    """Keras ``Sequence`` that serves batches of keypoint sequences from ``.npy`` files.

    ``dataset_dir`` must contain one sub-directory per class; every ``.npy``
    file inside a class directory is one sample. Class indices follow the
    sorted order of the sub-directory names.

    Args:
        dataset_dir: Root directory holding one sub-directory per class.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the sample order after every epoch.
        num_frames: Time steps each sample is reshaped to.
        num_keypoints: Features per time step each sample is reshaped to.
        num_classes: Width of the one-hot label vectors. Must be at least the
            number of class sub-directories.
    """

    def __init__(self, dataset_dir, batch_size=64, shuffle=True, num_frames=30,
                 num_keypoints=1662, num_classes=502):
        # Let the Keras base class set up its own state where it has any.
        super().__init__()
        self.dataset_dir = dataset_dir
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_frames = num_frames
        self.num_keypoints = num_keypoints
        self.num_classes = num_classes
        self.samples, self.labels = self._load_dataset()
        self.on_epoch_end()

    def _load_dataset(self):
        """Scan ``dataset_dir`` and return ``(sample_paths, path_to_label_index)``."""
        samples = []
        labels = {}
        label_idx = 0
        for label_dir in sorted(os.listdir(self.dataset_dir)):
            label_path = os.path.join(self.dataset_dir, label_dir)
            # Stray files in the dataset root (e.g. OS metadata) are not
            # classes: skip them without consuming a label index.
            if not os.path.isdir(label_path):
                continue
            for sample_file in sorted(os.listdir(label_path)):
                sample_path = os.path.join(label_path, sample_file)
                if os.path.isfile(sample_path) and sample_path.endswith('.npy'):
                    samples.append(sample_path)
                    labels[sample_path] = label_idx
            label_idx += 1
        if not samples:
            print("No samples found.")
        return samples, labels

    def __len__(self):
        """Return the number of full batches per epoch (a trailing partial batch is dropped)."""
        return len(self.samples) // self.batch_size

    def __getitem__(self, index):
        """Return batch ``index`` as ``(X, y)`` with one-hot encoded labels."""
        start = index * self.batch_size
        batch_samples = self.samples[start:start + self.batch_size]
        return self._generate_data(batch_samples)

    def _generate_data(self, batch_samples):
        """Load the given sample files into ``(X, one_hot_y)`` arrays.

        Raises:
            ValueError: If a file cannot be reshaped to
                ``(num_frames, num_keypoints)``.
        """
        # Size the arrays by the samples actually present so a short slice
        # never yields all-zero rows labelled as class 0.
        count = len(batch_samples)
        X = np.zeros((count, self.num_frames, self.num_keypoints))
        y = np.zeros(count, dtype=int)
        for i, sample_path in enumerate(batch_samples):
            keypoints = np.load(sample_path)
            try:
                X[i] = keypoints.reshape(self.num_frames, self.num_keypoints)
            except ValueError as err:
                # Name the offending file; the bare reshape error does not.
                raise ValueError(
                    f"{sample_path}: cannot reshape array of shape {keypoints.shape} "
                    f"to ({self.num_frames}, {self.num_keypoints})"
                ) from err
            y[i] = self.labels[sample_path]
        return X, tf.keras.utils.to_categorical(y, num_classes=self.num_classes)

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.samples)
Code used to extract keypoints:
import numpy as np
def adjust_sequence_length(features_list, target_length=30):
    """Resample per-frame feature vectors to exactly ``target_length`` frames.

    Feature vectors shorter than the longest one are right-padded with zeros
    so every frame has the same width. The frame axis is then adjusted:

    * more frames than ``target_length``: frames are picked at evenly spaced
      positions from first to last;
    * fewer frames: the sequence is repeated cyclically from its start until
      ``target_length`` frames are filled (the result is float-valued);
    * exactly ``target_length`` frames: returned as stacked.

    Args:
        features_list: Sequence of 1-D feature vectors, one per frame.
        target_length: Number of frames in the returned array.

    Returns:
        Array of shape ``(target_length, widest_feature_length)``. For an
        empty ``features_list`` this is a ``(target_length, 0)`` zero array.
    """
    num_frames = len(features_list)
    if num_frames == 0:
        # Without this guard the cyclic-repeat branch divides by zero.
        return np.zeros((target_length, 0))

    uniform_length = max(len(f) for f in features_list)
    frames = np.array(
        [np.pad(f, (0, uniform_length - len(f)), 'constant') for f in features_list]
    )

    if num_frames > target_length:
        indices = np.round(np.linspace(0, num_frames - 1, target_length)).astype(int)
        return frames[indices]

    if num_frames < target_length:
        # NOTE: cycling restarts the sequence from frame 0, so the temporal
        # order wraps around rather than being stretched.
        adjusted_features = np.zeros((target_length, uniform_length))
        adjusted_features[:] = frames[np.arange(target_length) % num_frames]
        return adjusted_features

    return frames
# Convert every sample directory of frame images into one .npy feature file.
# Input layout:  dataset_directory/<label>/<sample>/<frame images>
# Output layout: features_root_directory/<label>/<sample>.npy
mp_hands = mp.solutions.hands.Hands(static_image_mode=True, max_num_hands=2, min_detection_confidence=0.5)
mp_pose = mp.solutions.pose.Pose(static_image_mode=True, min_detection_confidence=0.5)
mp_face_mesh = mp.solutions.face_mesh.FaceMesh(static_image_mode=True, max_num_faces=1, min_detection_confidence=0.5)
try:
    for label_dir in tqdm(os.listdir(dataset_directory)):
        gesture_path = os.path.join(dataset_directory, label_dir)
        # Stray files next to the label directories cannot be listed.
        if not os.path.isdir(gesture_path):
            continue
        features_label_dir = os.path.join(features_root_directory, label_dir)
        os.makedirs(features_label_dir, exist_ok=True)
        for sample_dir in os.listdir(gesture_path):
            sample_path = os.path.join(gesture_path, sample_dir)
            if not os.path.isdir(sample_path):
                continue
            features_list = []
            # Sorted so frames are processed in filename order.
            for frame in sorted(os.listdir(sample_path)):
                image_path = os.path.join(sample_path, frame)
                image = cv2.imread(image_path)
                if image is not None:
                    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    features = extract_features(image_rgb, mp_hands, mp_pose, mp_face_mesh)
                    if features.size > 0:
                        features_list.append(features)
            if not features_list:
                # No usable frame: skip the sample instead of aborting the
                # whole run or saving an array with no features.
                print(f"No features extracted for {sample_path}; skipping.")
                continue
            adjusted_features = adjust_sequence_length(features_list)
            save_path = os.path.join(features_label_dir, f"{sample_dir}.npy")
            np.save(save_path, adjusted_features)
finally:
    # Release the MediaPipe resources even when extraction fails midway.
    mp_hands.close()
    mp_pose.close()
    mp_face_mesh.close()
I tried a data generator with normalization, but the training accuracy gets stuck at 0.2%:
from tensorflow.keras.utils import Sequence
import tensorflow as tf
import numpy as np
import os
# Debugging aid: makes every tf.function run eagerly instead of as a traced
# graph. NOTE(review): this usually slows training noticeably -- confirm it is
# still needed before long training runs.
tf.config.run_functions_eagerly(True)
class KeypointsDataGenerator(Sequence):
    """Keras ``Sequence`` serving min-max scaled keypoint batches from ``.npy`` files.

    ``dataset_dir`` must contain one sub-directory per class; every ``.npy``
    file inside a class directory is one sample. Class indices follow the
    sorted order of the sub-directory names. On construction every sample is
    loaded once to find the global minimum and maximum value, which are then
    used to scale each batch.

    Args:
        dataset_dir: Root directory holding one sub-directory per class.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the sample order after every epoch.
        num_frames: Time steps each sample is reshaped to.
        num_keypoints: Features per time step each sample is reshaped to.
        num_classes: Width of the one-hot label vectors. Must be at least the
            number of class sub-directories.
    """

    def __init__(self, dataset_dir, batch_size=32, shuffle=True, num_frames=30,
                 num_keypoints=1662, num_classes=502):
        # Let the Keras base class set up its own state where it has any.
        super().__init__()
        self.dataset_dir = dataset_dir
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_frames = num_frames
        self.num_keypoints = num_keypoints
        self.num_classes = num_classes
        self.samples, self.labels = self._load_dataset()
        self.min_val, self.max_val = self._find_min_max_values()
        self.on_epoch_end()

    def _load_dataset(self):
        """Scan ``dataset_dir`` and return ``(sample_paths, path_to_label_index)``."""
        samples = []
        labels = {}
        label_idx = 0
        for label_dir in sorted(os.listdir(self.dataset_dir)):
            label_path = os.path.join(self.dataset_dir, label_dir)
            # Stray files in the dataset root (e.g. OS metadata) are not
            # classes: skip them without consuming a label index.
            if not os.path.isdir(label_path):
                continue
            print(f'Processing label: {label_dir}')
            for sample_file in sorted(os.listdir(label_path)):
                sample_path = os.path.join(label_path, sample_file)
                if os.path.isfile(sample_path) and sample_path.endswith('.npy'):
                    samples.append(sample_path)
                    labels[sample_path] = label_idx
            label_idx += 1
        return samples, labels

    def __len__(self):
        """Return the number of full batches per epoch (a trailing partial batch is dropped)."""
        return len(self.samples) // self.batch_size

    def __getitem__(self, index):
        """Return batch ``index`` as ``(X, y)`` with one-hot encoded labels."""
        start = index * self.batch_size
        batch_samples = self.samples[start:start + self.batch_size]
        return self._generate_data(batch_samples)

    def _find_min_max_values(self):
        """Return the global ``(min, max)`` value over all sample files.

        Returns ``(0.0, 1.0)`` when there are no samples, which makes the
        scaling in ``_generate_data`` an identity transform.
        """
        if not self.samples:
            # np.min of an empty list raises; fall back to identity bounds.
            return 0.0, 1.0
        min_vals = []
        max_vals = []
        for sample_path in self.samples:
            print(f'Processing sample: {sample_path}')
            keypoints = np.load(sample_path)
            min_vals.append(np.min(keypoints))
            max_vals.append(np.max(keypoints))
        return np.min(min_vals), np.max(max_vals)

    def _generate_data(self, batch_samples):
        """Load, reshape and min-max scale the given samples into ``(X, one_hot_y)``.

        Raises:
            ValueError: If a file cannot be reshaped to
                ``(num_frames, num_keypoints)``.
        """
        # Constant data (max == min) would otherwise divide by zero and fill
        # the batch with NaN; a unit range maps it to zeros instead.
        value_range = self.max_val - self.min_val
        if value_range == 0:
            value_range = 1.0

        # Size the arrays by the samples actually present so a short slice
        # never yields all-zero rows labelled as class 0.
        count = len(batch_samples)
        X = np.zeros((count, self.num_frames, self.num_keypoints))
        y = np.zeros(count, dtype=int)
        for i, sample_path in enumerate(batch_samples):
            keypoints = np.load(sample_path)
            try:
                keypoints = keypoints.reshape(self.num_frames, self.num_keypoints)
            except ValueError as err:
                # Name the offending file; the bare reshape error does not.
                raise ValueError(
                    f"{sample_path}: cannot reshape array of shape {keypoints.shape} "
                    f"to ({self.num_frames}, {self.num_keypoints})"
                ) from err
            X[i] = (keypoints - self.min_val) / value_range  # Apply Min-Max scaling
            y[i] = self.labels[sample_path]
        return X, tf.keras.utils.to_categorical(y, num_classes=self.num_classes)

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.samples)
I tried lower dropout rates and got the same result. I also made the model both less and more complex and tried different learning rates, but nothing helped.