from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
def train_one_epoch(model, dataloader, optimizer):
    model.train()
    loss_list = []
    for batch in tqdm(dataloader):
        batch_data = {
            'input_ids': batch['input_ids'],
            'attention_mask': batch['attention_mask'],
            'labels': batch['labels']
        }
        loss = model(**batch_data).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_list.append(loss.detach().item())
    avg_loss = sum(loss_list) / len(loss_list)
    print('avg loss in epoch:', avg_loss)
def evaluate(model, dataloader):
    model.eval()
    all_labels = []
    all_predictions = []
    for batch in dataloader:
        with torch.no_grad():
            batch_data = {
                'input_ids': batch['input_ids'],
                'attention_mask': batch['attention_mask']
            }
            logits = model(**batch_data).logits
            predictions = torch.argmax(logits, dim=-1)
        labels = batch['labels']
        all_labels.extend(labels)
        all_predictions.extend(predictions)
    accuracy = compute_accuracy(all_predictions, all_labels)
    print("Accuracy", accuracy)
    return accuracy
def compute_accuracy(predictions, labels):
    correct = 0
    for pred, label in zip(predictions, labels):
        if pred == label:
            correct += 1
    return correct / len(labels)
def my_collate_fn(batched_samples):
    texts = [example['text'] for example in batched_samples]
    labels = [example['label'] for example in batched_samples]
    text_encoding = tokenizer(texts, max_length=128, truncation=True, padding=True, return_tensors='pt')
    labels = torch.LongTensor(labels)
    return {
        'input_ids': text_encoding['input_ids'].cuda(),
        'attention_mask': text_encoding['attention_mask'].cuda(),
        'labels': labels.cuda()
    }
torch.manual_seed(64)
batch_size = 16
learning_rate = 5e-5
num_epochs = 10
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.cuda()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate, eps=1e-8)
datasets = load_dataset("gpt3mix/sst2")
train_dataloader = DataLoader(
    datasets['train'],
    batch_size=8,
    shuffle=True,
    collate_fn=my_collate_fn,
    num_workers=0
)
validation_dataloader = DataLoader(
    datasets['validation'],
    batch_size=8,
    shuffle=False,
    collate_fn=my_collate_fn,
    num_workers=0
)
best_acc = 0.0
for epoch in range(1, num_epochs + 1):
    train_one_epoch(model, train_dataloader, optimizer)
    valid_acc = evaluate(model, validation_dataloader)
Output for the first few epochs:
100%|██████████| 865/865 [01:27<00:00, 9.89it/s]
avg loss in epoch: 0.6746856869559068
Accuracy 0.4908256880733945
100%|██████████| 865/865 [01:25<00:00, 10.09it/s]
avg loss in epoch: 0.6922555248516833
Accuracy 0.4908256880733945
100%|██████████| 865/865 [01:27<00:00, 9.89it/s]
avg loss in epoch: 0.6976809655310791
Accuracy 0.5091743119266054
Changing the learning rate does not help either.
You are not specifying the number of possible labels for your sequence classification model, which means you only allow the model to predict the same class for every single data point.
Just modify the part where you load the model to:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
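If you want to confirm that the classification head now has the right output size, you can inspect the model after loading. This is just a minimal sanity check; the classifier.out_proj attribute name below is specific to RobertaForSequenceClassification and may differ for other architectures.

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
print(model.config.num_labels)                 # expected: 2
print(model.classifier.out_proj.out_features)  # final linear layer should output 2 logits

Note that from_pretrained will warn that the classification head weights are newly initialized; that is expected, since the head is trained from scratch during fine-tuning.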