My ConvLSTM model for predicting sign language gestures, trained on the WLASL video dataset, shows no improvement in training accuracy over 10 epochs

Link to the dataset: https://www.kaggle.com/datasets/risangbaskoro/wlasl-processed

This is the dataset preprocessing code.

# IMPORTS
import os
import sys
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import v2
import numpy as np
# import matplotlib.pyplot as plt
# import torch.nn as nn
# from sklearn.metrics import classification_report
import time
# from torchvision import models

####
import json
import cv2
import random
# from torch.cuda.amp import autocast, GradScaler


# Transformations to apply to each frame
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
transformations = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),  # Scale to range [0, 1]
    v2.Normalize(mean=mean, std=std)  # ImageNet mean/std, since the ResNet backbone was pretrained on ImageNet
])


def preprocess_videos(bbox, fps, frame_end, frame_start, video_path):
    cap = cv2.VideoCapture(video_path)

    duration_frames = 1 * 30  # 1 sec at 30 fps = 30 frames

    if not cap.isOpened():
        print("[INFO] Warning: Could not open video." + video_path)
        return

    frames = []
    current_frame = 1

    # Draw the random augmentations once per video so every frame of a clip
    # gets the same flip/rotation; sampling per frame would break the
    # temporal consistency of the sequence
    flip_frame = random.random() > 0.5
    rotation_choices = [0, cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_180, cv2.ROTATE_90_COUNTERCLOCKWISE]
    rotation = random.choice(rotation_choices)

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if (current_frame >= frame_start) and (frame_end == -1 or current_frame <= frame_end):
            # Apply bounding box
            x_min, y_min, x_max, y_max = bbox
            cropped_frame = frame[y_min:y_max, x_min:x_max]

            # Resize frame
            output_frame_size = (500, 500)  # Arbitrary
            resized_frame = cv2.resize(cropped_frame, output_frame_size)  ## Could pass an interpolation flag (e.g. cv2.INTER_CUBIC) for better resizing

            # Flip (sampled once per video above)
            if flip_frame:
                resized_frame = cv2.flip(resized_frame, 0)  # flipCode 0 flips around the x-axis, 1 around the y-axis

            # Rotate (sampled once per video above)
            if rotation != 0:
                resized_frame = cv2.rotate(resized_frame, rotation)

            # Convert cv2 BGR to RGB format
            resized_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)

            # Transpose the frame from [height, width, channels] to [channels, height, width]
            resized_frame = np.transpose(resized_frame, (2, 0, 1))  # This format is PyTorch Native

            # Tensor casting
            # (v2.ToImage could also take the HWC ndarray directly, before the transpose above)
            tensor_frame = torch.from_numpy(resized_frame)  # numpy array to tensor; no .float() needed, ToDtype above handles it
            frame_transformed = transformations(tensor_frame)  # Apply transformations

            frames.append(frame_transformed)

        current_frame += 1

    cap.release()

    if not frames:
        print("[INFO] Warning: No frames extracted from " + video_path)
        return None

    if len(frames) < duration_frames:
        # If fewer frames than desired, repeat the last frame
        frames += [frames[-1]] * (duration_frames - len(frames))
    elif len(frames) > duration_frames:
        # If more frames than desired, sample evenly from the extracted frames
        indices = torch.linspace(0, len(frames) - 1, duration_frames, dtype=torch.int)
        frames = [frames[i] for i in indices]

    return frames


class VideoSignLanguageDataset(Dataset):
    def __init__(self, df, root):
        self.data_frame = df
        # self.transform = transform
        self.root_path = root

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        metadata = self.data_frame.iloc[idx]
        bbox = metadata['bbox']
        fps = metadata['fps']
        frame_end = metadata['frame_end']
        frame_start = metadata['frame_start']
        split = metadata['split']
        video_id = metadata['video_id']
        gloss = metadata['gloss']
        # label = self.data_frame.iloc[idx, -1]
        label = metadata['labels']
        video_path = f'{self.root_path}/{video_id}.mp4'

        frames = preprocess_videos(bbox, fps, frame_end, frame_start, video_path)

        # label = torch.tensor(label).long()  ## Equivalent to the line below
        label = torch.tensor(label, dtype=torch.int64)

        if frames:
            # .stack() RULE: Need to ensure all frames are of the same shape and tensor type for stacking
            tensor_frames = torch.stack(frames)

            # if self.transform:
            #   frames = self.transform(frames)
            # print("tf_type = ", tensor_frames.dtype)
            # print("tf_size = ", tensor_frames.size())
            return tensor_frames, label

        return None  # unreadable/empty videos are filtered out by custom_collate_fn


# Custom collate function to remove None values
def custom_collate_fn(batch):
    batch = list(filter(lambda x: x is not None, batch))  # Filter out None values

    if not batch:
        # Signal an empty batch; the training loop must check for None before unpacking
        return None

    return torch.utils.data.dataloader.default_collate(batch)


if __name__ == '__main__':
    print(torch.__version__)
    print(torch.cuda.get_device_name(0))

    DEVICE = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")  ## No need for cuda:0 here as it only has one GPU and 0 is default
    print(DEVICE)
    tot_mem = torch.cuda.get_device_properties(DEVICE).total_memory
    resv_mem = torch.cuda.memory_reserved(DEVICE)
    alloc_mem = torch.cuda.memory_allocated(DEVICE)
    print(tot_mem, resv_mem, alloc_mem, sep="/")

    # Set RNGs to same values every time including CUDA operations
    seed = 10
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed) ## For multiple GPUs

    # Load the metadata from the JSON file
    with open('WLASL_v0.3.json', 'r') as file:
        metadata = json.load(file)

    folder_path = r"C:Users2811rOneDriveDesktoparchive"
    ROOT = folder_path + "/videos"
    file_extension = ".mp4"

    # Define a function to check if the file exists
    def check_file_exists(row):
        file_name = f"{row['video_id']}{file_extension}"
        file_path = os.path.join(ROOT, file_name)
        return os.path.exists(file_path)


    labels = {'book': 0, 'drink': 1, 'computer': 2, 'before': 3, 'chair': 4, 'go': 5}
    len_labels = len(labels)
    print(len_labels)

    # Dataframe -start
    df = pd.DataFrame()

    for header in metadata:
        action = header['gloss']
        if action in labels.keys():
            temp = pd.json_normalize(header, record_path=['instances'], meta=['gloss']).drop(
                ['source', 'variation_id', 'url', 'signer_id', 'instance_id'], axis=1)
            temp['labels'] = labels[action]
            # Remove rows whose video files are missing on disk
            mask = temp.apply(check_file_exists, axis=1)  ## axis=1 applies the check per row
            temp = temp[mask]
            # Concat the dataframes
            df = pd.concat([df, temp])

    print(df)  # with truncation
    # print(df.to_string())  # This is for displaying entire dataframe without truncation

    df.to_pickle("./final_dataframe.pkl")

    # BATCH_SIZE = 32
    # training_dataset = VideoSignLanguageDataset(df=df, root=ROOT)
    # train_dataloader = DataLoader(training_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4,
    #                               collate_fn=custom_collate_fn)
    #
    #
    # for X, y in train_dataloader:
    #     print('X :::: ', X)
    #     print('y :::: ', y)
    #     break

    # print(y_batch)
    # unique_tensor = torch.unique(y_batch, return_counts=True)
    # print(unique_tensor, len(unique_tensor))
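
As a quick sanity check (a minimal sketch that reuses df, ROOT and the dataset class from the script above), one sample should come out as a [30, 3, 500, 500] float tensor with an int64 label, and the class balance can be read straight off the dataframe:

# Sanity check: shapes, dtype and class balance (sketch)
dataset = VideoSignLanguageDataset(df=df, root=ROOT)
sample = dataset[0]
if sample is not None:
    frames, label = sample
    print(frames.shape, frames.dtype, label.dtype)  # expect torch.Size([30, 3, 500, 500]) torch.float32 torch.int64
print(df['labels'].value_counts())  # number of videos per gloss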

This is the code for the model.

# IMPORTS
import torch
from torchvision import models
import torch.nn as nn
####


class ConvLSTMNet(nn.Module):
  def __init__(self, input_channels, output_classes):
    super().__init__()

    self.resnet_cnn_model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)

    # Replace the final fully connected layer of ResNet with an Identity layer
    # This is because we only want to use ResNet as a feature extractor
    self.resnet_cnn_model.fc = nn.Identity()
    # With fc replaced by Identity, ResNet outputs a flat [batch_size, 512] feature vector
    # self.input_size = 512  # Based on ResNet

    # This snippet programmatically determines the CNN output size to use as the LSTM input size
    dummy_input = torch.randn(1, input_channels, 100, 100)  ## [batch_size, channels, height, width]
    dummy_output = self.resnet_cnn_model(dummy_input)
    cnn_output = dummy_output.view(dummy_output.size(0), -1).size(1)  # This would give 512 as its output

    # LSTM Parameters
    self.output_classes = output_classes  # number of output classes # 27 is num of actions available
    self.num_layers = 2  # number of stacked layers of LSTM
    #self.input_size = 512
    self.input_size = cnn_output
    self.hidden_size = 1024  # Arbitrary
    # self.seq_length = 60  # Frames length ## Maybe use dummy_output.size(1)

    self.LSTM = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=self.num_layers, batch_first=True)
    # Maybe add few more dense layers to improve accuracy
    self.fc_1 = nn.Linear(in_features=self.hidden_size, out_features=512)
    self.relu_1 = nn.ReLU()
    self.fc_2 = nn.Linear(in_features=512, out_features=256)
    self.relu_2 = nn.ReLU()
    self.fc_3 = nn.Linear(in_features=256, out_features=128)
    self.relu_3 = nn.ReLU()
    self.fc_4 = nn.Linear(in_features=128, out_features=64)
    self.relu_4 = nn.ReLU()
    self.fc_5 = nn.Linear(in_features=64, out_features=self.output_classes)

  def forward(self, x):
    # print(f"Original X shape: {x.shape}")  # Debugging
    batch_size, seq_length, c, h, w = x.size()
    
    x = x.view(-1, c, h, w) ## Converting 5D tensor to 4D as CNN expects
    # print(f"Reshaped X for ResNet shape: {x.shape}")  # Debugging
    x = self.resnet_cnn_model(x)  ## Feeding into ResNet CNN
    # print(f"Output from ResNet shape: {x.shape}")  # Debugging

    # Reshape CNN output for LSTM input
    x = x.view(batch_size, seq_length, -1)  ## [batch_size, frames_of_video, features]
    # print(f"Reshaped X for LSTM shape: {x.shape}")  # Debugging

    # Single Video Processing
    outputs = []
    for i in range(batch_size):
      # Extract the i-th video in the batch
      video_seq = x[i].unsqueeze(0)

      # Reset hidden states for each video in the batch
      h_0 = torch.zeros(self.num_layers, 1, self.hidden_size).to(x.device)
      c_0 = torch.zeros(self.num_layers, 1, self.hidden_size).to(x.device)

      lstm_out, (hn, cn) = self.LSTM(video_seq, (h_0, c_0))  ## Feed into LSTM
      # Get the output from the last time step
      last_time_step_output = lstm_out[:, -1, :]
      outputs.append(last_time_step_output)


    x = torch.cat(outputs, dim=0)

    # x = lstm_out[:, -1, :]

    # Feed LSTM 'extracted' output to the following dense layers
    x = self.fc_1(x)
    x = self.relu_1(x)
    x = self.fc_2(x)
    x = self.relu_2(x)
    x = self.fc_3(x)
    x = self.relu_3(x)
    x = self.fc_4(x)
    x = self.relu_4(x)
    x = self.fc_5(x)

    return x

if __name__ == '__main__':
  model = ConvLSTMNet(input_channels=3, output_classes=3)
  print(model)
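
For reference, nn.LSTM initialises the hidden and cell states to zeros when none are passed, so resetting them per video in a Python loop should be equivalent to one batched call. A minimal standalone sketch of that equivalence, using sizes matching the model above:

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=512, hidden_size=1024, num_layers=2, batch_first=True)
x = torch.randn(4, 30, 512)  # [batch_size, seq_length, features]

# Batched call: h_0/c_0 default to zeros for every sequence
out_batched, _ = lstm(x)

# Per-video loop with explicit zero states, as in forward() above
outs = []
for i in range(x.size(0)):
    h0 = torch.zeros(2, 1, 1024)
    c0 = torch.zeros(2, 1, 1024)
    out, _ = lstm(x[i].unsqueeze(0), (h0, c0))
    outs.append(out)
out_looped = torch.cat(outs, dim=0)

print(torch.allclose(out_batched, out_looped, atol=1e-5))  # True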

This is the training script.

import os
from torch.cuda.amp import GradScaler, autocast
import model
import data_preprocess
from data_preprocess import VideoSignLanguageDataset
import torch
from model import ConvLSTMNet
from torchvision import models
import torch.nn as nn
import time
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import v2

if __name__ == "__main__":

    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

    print(torch.__version__)
    print(torch.cuda.get_device_name(0))

    DEVICE = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")  ## No need for cuda:0 here as it only has one GPU and 0 is default
    print(DEVICE)
    tot_mem = torch.cuda.get_device_properties(DEVICE).total_memory
    resv_mem = torch.cuda.memory_reserved(DEVICE)
    alloc_mem = torch.cuda.memory_allocated(DEVICE)
    print(tot_mem, resv_mem, alloc_mem, sep="/")

    # Set RNGs to same values every time including CUDA operations
    seed = 10
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed) ## For multiple GPUs

    # Creating instance of model
    input_channels = 3  # RGB
    output_classes = 6  # ASL # 27 is num of actions available in the dataset
    conv_lstm_model = ConvLSTMNet(input_channels, output_classes).to(DEVICE)
    print(conv_lstm_model)
    print(next(conv_lstm_model.parameters()).device)

    # Loss function and Optimizer
    scaler = GradScaler()
    LRN_RATE = 0.001
    loss_function = nn.CrossEntropyLoss()  # This loss function itself does LogSoftmax (+ NLLLoss)
    g_descent_optimizer = torch.optim.Adam(conv_lstm_model.parameters(),
                                           lr=LRN_RATE)  ##Adam is a type of gradient descent

    # Training Model
    overall_train_losses = []
    overall_train_accuracy = []
    # test_losses = []
    # test_correct = []

    folder_path = r"C:Users2811rOneDriveDesktoparchive"
    ROOT = folder_path + "/videos"
    file_extension = ".mp4"

    df = pd.read_pickle("final_dataframe.pkl")
    BATCH_SIZE = 15
    training_dataset = VideoSignLanguageDataset(df=df, root=ROOT)
    train_dataloader = DataLoader(training_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=3,
                                  collate_fn=data_preprocess.custom_collate_fn)

    # Train Start
    EPOCHS = 50
    total_dataset = len(train_dataloader.dataset)
    print("Total Dataset: ", total_dataset)

    print("[INFO] Training the network...")
    start_time = time.time()

    for e in range(EPOCHS):
        single_epoch_st_time = time.time()
        conv_lstm_model.train()  ## Set model to train mode

        # These variables are for entire dataset once (all batches)
        total_train_loss = 0
        total_train_correct = 0

        for batch in train_dataloader:
            # Skip empty batches signalled by custom_collate_fn
            if batch is None:
                continue
            X_batch, y_batch = batch

            # Push Data Tensors to the GPU
            (X_batch, y_batch) = X_batch.to(DEVICE), y_batch.to(DEVICE)

            g_descent_optimizer.zero_grad()

            with autocast():  ## Mixed-precision forward pass
                pred = conv_lstm_model(X_batch)  ## Predicted values of y
                loss = loss_function(pred, y_batch)

            # g_descent_optimizer.zero_grad()

            scaler.scale(loss).backward()
            # loss.backward()
            scaler.step(g_descent_optimizer)
            # g_descent_optimizer.step()
            scaler.update()

            total_train_loss += loss.item()
            total_train_correct += (pred.argmax(1) == y_batch).type(torch.float).sum().item()

        ## For one epoch
        avg_train_accuracy = total_train_correct / len(train_dataloader.dataset)
        avg_train_loss = total_train_loss / len(train_dataloader)

        overall_train_accuracy.append(avg_train_accuracy)
        overall_train_losses.append(avg_train_loss)

        print("[INFO] EPOCH: {}/{}".format(e + 1, EPOCHS))
        print("Train loss: {:.6f}, Train accuracy: {:.4f}".format(avg_train_loss, avg_train_accuracy))
        print(f'{total_train_correct}/{len(train_dataloader.dataset)}')
        print("Time took for one epoch: ", (time.time() - single_epoch_st_time) / 60)

    end_time = time.time()

    total_time = end_time - start_time

    print(f'Total Training Time: {total_time / 60} Minutes.')

    torch.save(conv_lstm_model.state_dict(), './model_weights.pth')
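
The overall_train_losses and overall_train_accuracy lists are collected but never used; a minimal sketch for plotting them after training (assuming matplotlib is installed):

# Plot the recorded training curves (sketch)
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(overall_train_losses)
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Avg train loss")
ax2.plot(overall_train_accuracy)
ax2.set_xlabel("Epoch")
ax2.set_ylabel("Train accuracy")
fig.savefig("training_curves.png")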

I am currently classifying 6 sign gestures. Each video is 30 fps. My total dataset has 73 videos.
[('book', 40), ('drink', 35), ('computer', 30), ('before', 26), ('chair', 26), ('go', 26)]
This list shows each gesture name and the number of videos available in the dataset for that gesture.

I tried training the model with different hyperparameters, but the training accuracy never rose above 10% over 10 epochs. I also tweaked the video dimensions, the video transformations, etc. I tried not resetting the hidden states at all, resetting them once per batch, and (currently) resetting them for each video in a batch.
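
A sanity check that could narrow this down (a minimal sketch reusing conv_lstm_model, loss_function, g_descent_optimizer, train_dataloader and DEVICE from the training script above): try to overfit a single batch; if the loss does not fall towards zero there, the issue is likely in the model/data wiring rather than the hyperparameters.

# Overfit-one-batch sanity check (sketch)
X_small, y_small = next(iter(train_dataloader))
X_small, y_small = X_small.to(DEVICE), y_small.to(DEVICE)

for step in range(100):
    g_descent_optimizer.zero_grad()
    pred = conv_lstm_model(X_small)
    loss = loss_function(pred, y_small)
    loss.backward()
    g_descent_optimizer.step()
    if step % 10 == 0:
        print(step, loss.item())  # should approach 0 if everything is wired correctly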

This is a screenshot of the accuracies during training for a few epochs (I stopped training midway because the accuracy keeps bouncing around the same value without increasing):
[Accuracies Photo]

