I am trying to train the following model, but I keep running into the error shown at the end of this post:
import pandas as pd
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn.utils.clip_grad import clip_grad_norm_
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import os
import random
# Set the seed for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Load the datasets
path = "/content/train_set.csv"
path_val = "/content/dev_set.csv"
path_test = "/content/test_ur.csv"
ds_train = pd.read_csv(path)
ds_val = pd.read_csv(path_val)
ds_test = pd.read_csv(path_test)

# Drop rows with missing values
ds_train = ds_train.dropna()
ds_val = ds_val.dropna()
ds_test = ds_test.dropna()

train_df = ds_train
val_df = ds_val
test_df = ds_test
# Create a custom dataset class
class UrduEnglishDataset(Dataset):
    def __init__(self, df, tokenizer, max_source_length, max_target_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        urdu_text = self.df.iloc[idx, 0]
        english_text = self.df.iloc[idx, 1]
        encoding = self.tokenizer.encode_plus(
            urdu_text,
            add_special_tokens=True,
            max_length=self.max_source_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        labels = self.tokenizer.encode_plus(
            english_text,
            add_special_tokens=True,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': labels['input_ids'].flatten(),
            'labels_attention_mask': labels['attention_mask'].flatten()
        }
# Set the model parameters
model_params = {
    "MODEL": "t5-small",
    "TRAIN_BATCH_SIZE": 2,          # other values tried: 4, 8
    "VALID_BATCH_SIZE": 1,          # other values tried: 2, 4
    "TRAIN_EPOCHS": 5,              # other value tried: 3
    "VAL_EPOCHS": 1,
    "LEARNING_RATE": 1e-4,
    "MAX_SOURCE_TEXT_LENGTH": 64,   # other values tried: 128, 256, 512
    "MAX_TARGET_TEXT_LENGTH": 64,   # other values tried: 128, 256, 512
    "SEED": 42,
    "GRAD_CLIP": 1.0,
    "PATIENCE": 5
}
# Set the device (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
# Create the datasets and data loaders
train_dataset = UrduEnglishDataset(train_df, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"])
val_dataset = UrduEnglishDataset(val_df, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"])
test_dataset = UrduEnglishDataset(test_df, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"])
train_loader = DataLoader(train_dataset, batch_size=model_params["TRAIN_BATCH_SIZE"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=model_params["VALID_BATCH_SIZE"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=model_params["VALID_BATCH_SIZE"], shuffle=False)
# Load the model
model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
# Set the training arguments
training_args = TrainingArguments(
    output_dir='results_t5small',
    num_train_epochs=model_params["TRAIN_EPOCHS"],
    per_device_train_batch_size=model_params["TRAIN_BATCH_SIZE"],
    per_device_eval_batch_size=model_params["VALID_BATCH_SIZE"],
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='logs',
    logging_steps=10,
    eval_strategy='steps',
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    save_total_limit=5,
    report_to='tensorboard',
    learning_rate=model_params["LEARNING_RATE"],
    fp16=True
)
def compute_metrics(pred, label_ids=None):
    # Convert the label_ids to a tensor
    label_ids = label_ids if label_ids is not None else torch.tensor(pred.label_ids)
    # Convert the predictions to a tensor
    predictions = torch.tensor(pred.predictions)
    # Ensure the predictions tensor has the expected sequence length
    if predictions.size(3) != model_params["MAX_TARGET_TEXT_LENGTH"]:
        # Reshape the predictions tensor to match the expected length
        predictions = predictions.reshape(predictions.size(0), predictions.size(1), model_params["MAX_TARGET_TEXT_LENGTH"])
    # Compute the accuracy
    accuracy = torch.sum(label_ids == predictions.argmax(-1)).item()
    # Return the accuracy
    return {"accuracy": accuracy}
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
# Train the model
history = trainer.train()
Any suggestions? I receive the following error:
ValueError: expected sequence of length 32128 at dim 3 (got 512)
PS: Note that I also tried setting model_params["MAX_TARGET_TEXT_LENGTH"] = 32128 just to check, but that does not work either: the session simply stops.
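
For context: 32128 is the t5-small vocabulary size and 512 matches its hidden size, so my guess is that pred.predictions is a tuple holding both the raw logits of shape (batch, target_len, 32128) and another output such as encoder_last_hidden_state of shape (batch, source_len, 512), which torch.tensor() cannot stack into one tensor. Below is a rough sketch of the direction I was considering, assuming a transformers version recent enough to support the Trainer's preprocess_logits_for_metrics hook; the pad-token mask and the compute_metrics_sketch name are my own guesses/illustration (my labels above are padded with the pad token rather than -100):

def preprocess_logits_for_metrics(logits, labels):
    # The model may return a tuple (lm_logits, ...); keep only the LM logits
    # and reduce them to predicted token ids, so the Trainer accumulates
    # (batch, target_len) ids instead of the full (batch, target_len, 32128) logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)

def compute_metrics_sketch(pred):
    # With the hook above, predictions and label_ids should both be
    # (num_examples, MAX_TARGET_TEXT_LENGTH) arrays of token ids.
    predictions = pred.predictions
    label_ids = pred.label_ids
    # Ignore padded target positions when scoring token-level accuracy
    # (assumption: labels are padded with tokenizer.pad_token_id, as above)
    mask = label_ids != tokenizer.pad_token_id
    accuracy = (predictions == label_ids)[mask].mean()
    return {"accuracy": float(accuracy)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_sketch,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

Is this the right way to go, or am I misreading the error?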