I’m encountering a CUDA out-of-memory error when using the compute_metrics function with the Hugging Face Trainer during model evaluation. My GPU runs out of memory while trying to compute the ROUGE scores. Below is a summary of my setup and the error message:
I have a val_dataset with 352 samples.
The model is GPT-2.
I want to test the ROUGE metric, but when I try, it gives me the error. I am currently using Google Colab.
- How can I efficiently compute ROUGE or BLEU metrics without running out of GPU memory?
- Are there any recommended strategies or configurations to handle large-scale evaluation on limited GPU memory?
The code looks like this:
# Tokenize both splits in batched mode with the shared tokenize_function.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True)

# Start from the stock "gpt2" configuration and override the vocabulary size
# and special-token ids so they match the tokenizer in use.
# NOTE(review): recent GPT2Config versions read `n_positions` for the context
# length; confirm `n_ctx` still has an effect in the installed transformers.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=MAX,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Build the model from the config only and place it on the GPU when one is
# available, otherwise on the CPU.
model = GPT2LMHeadModel(config)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Report the total parameter count in millions.
mode_size = sum(param.numel() for param in model.parameters())
print(f"Model size: {mode_size/1000**2:.1f}M parameters")

from transformers import DataCollatorForLanguageModeling

# mlm=False selects the non-masked language-modeling collation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
class EarlyStoppingCallback(TrainerCallback):
    """Stop training when ``eval_loss`` stops improving.

    The callback remembers the lowest ``eval_loss`` seen so far and counts
    consecutive evaluations that fail to beat it. Once that count reaches
    ``patience``, it sets ``control.should_training_stop``.

    NOTE(review): transformers ships its own callback with this same name;
    make sure the two are not both imported into this namespace.

    Args:
        patience: Number of consecutive non-improving evaluations to tolerate
            before requesting that training stop.
    """

    def __init__(self, patience=3):
        super().__init__()
        self.patience = patience
        self.best_loss = np.inf
        self.epochs_no_improve = 0

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Update the patience counter from the latest evaluation metrics.

        Does nothing when ``metrics`` is missing or has no ``"eval_loss"``
        entry, so an evaluation without a loss neither counts against the
        patience budget nor raises.
        """
        # `metrics` defaults to None, so guard before calling .get() on it.
        eval_loss = None if metrics is None else metrics.get("eval_loss", None)
        if eval_loss is None:
            return

        if eval_loss < self.best_loss:
            # New best: remember it and reset the counter.
            self.best_loss = eval_loss
            self.epochs_no_improve = 0
        else:
            self.epochs_no_improve += 1

        if self.epochs_no_improve >= self.patience:
            control.should_training_stop = True
# Stop after 3 consecutive evaluations without an eval_loss improvement.
early_stopping_callback = EarlyStoppingCallback(patience=3)

training_args = TrainingArguments(
    output_dir="./model",
    hub_model_id="profile/model",
    eval_strategy="epoch",
    gradient_accumulation_steps=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Move evaluation outputs to the CPU after every step instead of keeping
    # the whole prediction set on the GPU. This is slower but bounds GPU
    # memory during evaluation when compute_metrics is used.
    eval_accumulation_steps=1,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    save_steps=500,
    save_total_limit=3,
    learning_rate=1e-4,
    # The model is placed on the CPU when CUDA is unavailable, so only
    # request fp16 mixed precision when a GPU is present.
    fp16=torch.cuda.is_available(),
    push_to_hub=True,
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    callbacks=[early_stopping_callback],
)

trainer.train()