I’m having trouble fine-tuning a BERT model on a classification task, as I’m quite new to this. My data has two columns, “item_title” (my input) and “meta_categ_id” (the categorical output).
In particular, I get the error:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)
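For reference, here is a minimal snippet (using a plain nn.Embedding as a stand-in, not my actual model) that raises the same kind of device-mismatch error, so I assume something in my pipeline leaves the inputs on the CPU while the model sits on the GPU:

import torch
import torch.nn as nn

emb = nn.Embedding(10, 4).to('cuda')  # stand-in for the model, moved to the GPU
ids = torch.tensor([1, 2, 3])         # input indices accidentally left on the CPU
emb(ids)  # raises a RuntimeError mentioning cuda:0 and cpu in index_select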
Here is my code (modified in part using Claude):
import torch
from datasets import Dataset

# df is a pandas DataFrame with the "item_title" and "meta_categ_id" columns
data = Dataset.from_pandas(df)
data = data.train_test_split(test_size=0.1)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Create a label map from existing leaf category names to class indices,
# including an unknown category for categories not in the training data
meta_categ_set = sorted(set([i['meta_categ_id'] for i in data['train']]))
meta_categ_map = {c: i for i, c in enumerate(meta_categ_set)}
categ_unknown = len(meta_categ_map)
meta_categ_map['UNK'] = categ_unknown
id2label = {v: k for k, v in meta_categ_map.items()}

microbert = 'BERT-...'
tokenizer = pybay.bert.AutoTokenizer.from_pretrained(microbert)
model = pybay.bert.EBertForSequenceClassification.from_pretrained(microbert, id2label=id2label).to(device)  # I should note that this model is usually used for sentence embeddings
def preprocess(features):
    max_length = 128  # or whatever maximum length you want to use
    # Tokenize the title with truncation and padding
    tokenized = tokenizer(
        features['item_title'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    result = {
        'input_ids': tokenized['input_ids'].squeeze(),
        'attention_mask': tokenized['attention_mask'].squeeze(),
    }
    # Convert the meta category name into an integer label
    result['label'] = torch.tensor([meta_categ_map.get(x, categ_unknown) for x in features['meta_categ_id']])
    # result = {k: v.clone().detach().to(device) for k, v in result.items()}
    return result
# Apply preprocessing to all entries in the dataset
data = data.map(
    preprocess,
    batched=True,
    remove_columns=data['train'].column_names,
    num_proc=1  # Use only one process to avoid CUDA issues
)
# Set the format of the datasets to PyTorch tensors
data.set_format('torch')
# Move data to the correct device
def to_device(example):
    return {k: v.to(device) for k, v in example.items()}

data['train'] = data['train'].map(to_device)
data['test'] = data['test'].map(to_device)
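# Note: I am not sure this step is needed; I read that the Trainer is supposed to move
# each batch to the model's device on its own, so this may be one of my redundant
# "to(device)" modifications.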
# this part displays "Error displaying widget: model not found" as well as numerous executor losses
# Define evaluation metrics
from transformers import EvalPrediction
import numpy as np
from sklearn.metrics import accuracy_score, top_k_accuracy_score
# This function will be called to evaluate a prediction, which contains model output and a label
def compute_metrics(eval_predictions: EvalPrediction):
    # model returns logits of shape [n_samples, n_classes]
    logits = eval_predictions.predictions
    predictions = np.argmax(logits, axis=1)
    trues = eval_predictions.label_ids
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().numpy()
    if isinstance(trues, torch.Tensor):
        trues = trues.cpu().numpy()
    if isinstance(logits, torch.Tensor):
        logits = logits.cpu().numpy()
    # We use metrics provided by scikit-learn
    results = {
        "Accuracy": accuracy_score(y_true=trues, y_pred=predictions),
        "Top-5 accuracy": top_k_accuracy_score(y_true=trues, y_score=logits, k=5, labels=np.arange(len(meta_categ_map))),
    }
    return results
import os
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
training_args = TrainingArguments(
    output_dir=os.path.join(output_dir, 'model_checkpoints'),  # Output model checkpoints
    num_train_epochs=2,               # Number of training epochs
    per_device_train_batch_size=64,   # Batch size for training
    per_device_eval_batch_size=64,    # Batch size for evaluation
    learning_rate=1e-4,               # Learning rate
    warmup_steps=100,                 # Warmup for learning rate schedule
    logging_steps=5000,               # Logging frequency
    logging_dir=os.path.join(output_dir, 'model_logs'),  # Directory for logs
    fp16=True,                        # Mixed precision training on V100 GPUs
    save_strategy="epoch",            # Save model checkpoint after each epoch
    no_cuda=(device != 'cuda'),       # modification
    dataloader_num_workers=4,         # 4 background processes to load data faster
)
# Learn more about Trainer: https://huggingface.co/transformers/main_classes/trainer.html#id1
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=data['train'],
    # .map(lambda examples: {'input_ids': examples['input_ids'].to(device), 'attention_mask': examples['attention_mask'].to(device), 'label': examples['label'].to(device)}),
    eval_dataset=data['test'],
    # .map(lambda examples: {'input_ids': examples['input_ids'].to(device), 'attention_mask': examples['attention_mask'].to(device), 'label': examples['label'].to(device)}),
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)
# This will start a training loop
trainer.train()
# Final evaluation on a dev set
results = trainer.evaluate(data['test'])
print(results)
# Save the final model and tokenizer to disk
trainer.save_model(os.path.join(output_dir, 'my_pretrained_model'))
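In case it helps, here is a quick sanity check (debugging only, not part of the training script itself) that I can run to confirm where the model parameters and one preprocessed example actually end up:

# Debugging only: confirm which devices the model and the data end up on
print(next(model.parameters()).device)  # I expect cuda:0 here

sample = data['train'][0]  # with set_format('torch') this should return tensors
print({k: v.device for k, v in sample.items() if hasattr(v, 'device')})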
Please let me know where my code needs to be fixed. I made numerous modifications involving “to(device)” that may seem random, so forgive me. I’d appreciate any help or clarification.