I am trying to train the following model, but I keep running into the error shown at the end of this post:
import pandas as pd
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn.utils.clip_grad import clip_grad_norm_
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import os
import random
# Set the seed for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Load the datasets
path = "/content/train_set.csv"
path_val = "/content/dev_set.csv"
path_test = "/content/test_ur.csv"
ds_train = pd.read_csv(path)
ds_val = pd.read_csv(path_val)
ds_test = pd.read_csv(path_test)

# Drop rows with missing values
ds_train = ds_train.dropna()
ds_val = ds_val.dropna()
ds_test = ds_test.dropna()

train_df = ds_train
val_df = ds_val
test_df = ds_test
# Create a custom dataset class
class UrduEnglishDataset(Dataset):
    def __init__(self, df, tokenizer, max_source_length, max_target_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        urdu_text = self.df.iloc[idx, 0]
        english_text = self.df.iloc[idx, 1]
        encoding = self.tokenizer.encode_plus(
            urdu_text,
            add_special_tokens=True,
            max_length=self.max_source_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        labels = self.tokenizer.encode_plus(
            english_text,
            add_special_tokens=True,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': labels['input_ids'].flatten(),
            'labels_attention_mask': labels['attention_mask'].flatten()
        }
# Set the model parameters
model_params = {
    "MODEL": "t5-small",
    "TRAIN_BATCH_SIZE": 2,          # other values tried: 4, 8
    "VALID_BATCH_SIZE": 1,          # other values tried: 2, 4
    "TRAIN_EPOCHS": 5,              # other value tried: 3
    "VAL_EPOCHS": 1,
    "LEARNING_RATE": 1e-4,
    "MAX_SOURCE_TEXT_LENGTH": 64,   # other values tried: 128, 256, 512
    "MAX_TARGET_TEXT_LENGTH": 64,   # other values tried: 128, 256, 512
    "SEED": 42,
    "GRAD_CLIP": 1.0,
    "PATIENCE": 5
}
# Set the device (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
# Create the datasets and data loaders
train_dataset = UrduEnglishDataset(train_df, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"])
val_dataset = UrduEnglishDataset(val_df, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"])
test_dataset = UrduEnglishDataset(test_df, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"])
train_loader = DataLoader(train_dataset, batch_size=model_params["TRAIN_BATCH_SIZE"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=model_params["VALID_BATCH_SIZE"], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=model_params["VALID_BATCH_SIZE"], shuffle=False)
# Load the model
model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
# Set the training arguments
training_args = TrainingArguments(
    output_dir='results_t5small',
    num_train_epochs=model_params["TRAIN_EPOCHS"],
    per_device_train_batch_size=model_params["TRAIN_BATCH_SIZE"],
    per_device_eval_batch_size=model_params["VALID_BATCH_SIZE"],
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='logs',
    logging_steps=10,
    eval_strategy='steps',
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    save_total_limit=5,
    report_to='tensorboard',
    learning_rate=model_params["LEARNING_RATE"],
    fp16=True
)
def compute_metrics(pred, label_ids=None):
    # Convert the label_ids to a tensor
    label_ids = label_ids if label_ids is not None else torch.tensor(pred.label_ids)
    # Convert the predictions to a tensor
    predictions = torch.tensor(pred.predictions)
    # Ensure the predictions tensor has the expected sequence length
    if predictions.size(3) != model_params["MAX_TARGET_TEXT_LENGTH"]:
        # Reshape the predictions tensor to match the expected length
        predictions = predictions.reshape(predictions.size(0), predictions.size(1), model_params["MAX_TARGET_TEXT_LENGTH"])
    # Compute the accuracy
    accuracy = torch.sum(label_ids == predictions.argmax(-1)).item()
    # Return the accuracy
    return {"accuracy": accuracy}
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
# Train the model
history = trainer.train()
Any suggestions? I receive the following error:
ValueError: expected sequence of length 32128 at dim 3 (got 512)
PS: Note that I also tried setting model_params["MAX_TARGET_TEXT_LENGTH"] = 32128 just to check, but that does not work either: the session simply stops.
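
For context: 32128 is the t5-small vocabulary size and 512 matches its hidden size, so my guess is that pred.predictions is a tuple holding both the raw logits of shape (batch, target_len, 32128) and another output such as encoder_last_hidden_state of shape (batch, source_len, 512), which torch.tensor() cannot stack into one tensor. Below is a rough sketch of the direction I was considering, assuming a transformers version recent enough to support the Trainer's preprocess_logits_for_metrics hook; the pad-token mask and the compute_metrics_sketch name are my own guesses/illustration (my labels above are padded with the pad token rather than -100):

def preprocess_logits_for_metrics(logits, labels):
    # The model may return a tuple (lm_logits, ...); keep only the LM logits
    # and reduce them to predicted token ids, so the Trainer accumulates
    # (batch, target_len) ids instead of the full (batch, target_len, 32128) logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)

def compute_metrics_sketch(pred):
    # With the hook above, predictions and label_ids should both be
    # (num_examples, MAX_TARGET_TEXT_LENGTH) arrays of token ids.
    predictions = pred.predictions
    label_ids = pred.label_ids
    # Ignore padded target positions when scoring token-level accuracy
    # (assumption: labels are padded with tokenizer.pad_token_id, as above)
    mask = label_ids != tokenizer.pad_token_id
    accuracy = (predictions == label_ids)[mask].mean()
    return {"accuracy": float(accuracy)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics_sketch,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

Is this the right way to go, or am I misreading the error?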