Please consider the following code:
from datasets import load_dataset_builder, load_dataset
import numpy as np
import os
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
class QGenMetrics:
    def __init__(self, tokenizer, ignore_index=-100):
        self.tokenizer = tokenizer
        self.ignore_index = ignore_index

    def clean_labels(self, labels):
        labels[labels == self.ignore_index] = self.tokenizer.pad_token_id
        return labels

    def compute_metrics_validation(self, eval_preds):
        predictions = eval_preds.predictions
        labels = eval_preds.label_ids
        # predictions, labels = eval_preds
        try:
            labels = self.clean_labels(labels)
            predictions = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
        except Exception as e:
            print(e)
            print("PREDS")
            print(predictions)
            print("LABELS")
            print(labels)
            for i, pred in enumerate(predictions):
                if -100 in pred:
                    print(f"preds[{i}]: {pred}")
            assert False
        res = {"metric": 1.0}
        return res

    def compute_metrics_test(self, test_preds):
        res = {"metric": 1.0}
        return res
#<
def actual_encoding(examples, tokenizer, max_source_len=None, max_target_len=None, ignore_label=-100):
    # if no max length is given: no padding and no truncation, the collator will do the job
    prompts_enc = tokenizer(examples["question"],
                            truncation=True if max_source_len else False,
                            max_length=max_source_len if max_source_len else None,
                            padding="max_length" if max_source_len else False,
                            return_tensors="pt" if max_source_len else None)
    targets_enc = tokenizer(examples["text"],
                            truncation=True if max_target_len else False,
                            padding="max_length" if max_target_len else False,
                            max_length=max_target_len if max_target_len else None,
                            return_tensors="pt" if max_target_len else None)
    examples["input_ids"] = prompts_enc["input_ids"]
    examples["attention_mask"] = prompts_enc["attention_mask"]
    examples["labels"] = targets_enc["input_ids"]
    return examples
#< actual_encoding
# download Bilkies/QuestionGeneration from HF hub
# https://huggingface.co/datasets/Bilkies/QuestionGeneration
ds_name = 'Bilkies/QuestionGeneration'
ds_builder = load_dataset_builder(ds_name)
print(ds_builder.info)
dataset = load_dataset(ds_name)
display(dataset)
train_ds = dataset['train']
print("number of original training point", len(train_ds))
# subsample train_ds
train_ds = train_ds.select(range(1000))
print("after sampling", len(train_ds))
test_ds = dataset['validation'].select(range(500))
# split training_ds in 80/20 for training and validation
train_ds = train_ds.train_test_split(test_size=0.2)
valid_ds = train_ds['test']
train_ds = train_ds['train']
model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
max_source_len = 31
max_target_len = 50
train_ds = train_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len":max_source_len, "max_target_len":max_target_len}, batched=True, num_proc=2)
valid_ds = valid_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len": max_source_len, "max_target_len": max_target_len}, batched=True, num_proc=2)
test_ds = test_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len": max_source_len, "max_target_len": max_target_len}, batched=True, num_proc=2)
for ds in [train_ds, valid_ds, test_ds]:
    ds_in_lens = [len(ex["input_ids"]) for ex in ds]
    ds_lab_lens = [len(ex["labels"]) for ex in ds]
    check = np.array([l == ds_in_lens[0] for l in ds_in_lens[1:]]).all() and np.array([l == ds_lab_lens[0] for l in ds_lab_lens[1:]]).all()
    assert check, f"check lengths in {ds}"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
model = model.to(torch.device("cuda:0"))
bs = 1
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest", label_pad_token_id=-100)
evaluator = QGenMetrics(tokenizer)
trainer = Seq2SeqTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    data_collator=data_collator,
    compute_metrics=evaluator.compute_metrics_validation,
    args=Seq2SeqTrainingArguments(
        output_dir="./_remove",
        gradient_accumulation_steps=1,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=1,
        seed=3,
        data_seed=4,
        predict_with_generate=True,
        eval_strategy="epoch",
        report_to="none"
    )  #< training args
)  #< trainer
trainer.train()
Which I explain here:
# download Bilkies/QuestionGeneration from HF hub
# https://huggingface.co/datasets/Bilkies/QuestionGeneration
ds_name = 'Bilkies/QuestionGeneration'
ds_builder = load_dataset_builder(ds_name)
print(ds_builder.info)
dataset = load_dataset(ds_name)
display(dataset)
train_ds = dataset['train']
print("number of original training point", len(train_ds))
# subsample train_ds
train_ds = train_ds.select(range(1000))
print("after sampling", len(train_ds))
test_ds = dataset['validation'].select(range(500))
# split training_ds in 80/20 for training and validation
train_ds = train_ds.train_test_split(test_size=0.2)
valid_ds = train_ds['test']
train_ds = train_ds['train']
After downloading the dataset Bilkies/QuestionGeneration, I keep only the first 1000 examples for "speed" purposes. Then I create three data partitions:
- the test dataset test_ds, which corresponds to the validation partition of the HF dataset;
- train_ds and valid_ds, which correspond to 80% and 20% of the (subsampled) training data, respectively.
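As a sanity check (just a sketch; the column names "question" and "text" are the ones actual_encoding expects below), the raw fields can be inspected before tokenization:
# Sketch: confirm the raw columns that actual_encoding will read.
print(train_ds.column_names)   # should include "question" and "text"
print(valid_ds.column_names)
print(test_ds.column_names)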
model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
max_source_len = 31
max_target_len = 50
train_ds = train_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len":max_source_len, "max_target_len":max_target_len}, batched=True, num_proc=2)
valid_ds = valid_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len": max_source_len, "max_target_len": max_target_len}, batched=True, num_proc=2)
test_ds = test_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len": max_source_len, "max_target_len": max_target_len}, batched=True, num_proc=2)
for ds in [train_ds, valid_ds, test_ds]:
    ds_in_lens = [len(ex["input_ids"]) for ex in ds]
    ds_lab_lens = [len(ex["labels"]) for ex in ds]
    check = np.array([l == ds_in_lens[0] for l in ds_in_lens[1:]]).all() and np.array([l == ds_lab_lens[0] for l in ds_lab_lens[1:]]).all()
    assert check, f"check lengths in {ds}"
I download a model and the related tokenizer, then tokenize the three datasets using actual_encoding, with pre-determined max lengths for input_ids and labels. The for loop at the end checks that all the fields used in training have the same length.
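The same check can also be written more compactly; a sketch of an equivalent version using column access on the encoded datasets:
# Sketch of an equivalent length check: after padding to max_length, every
# dataset should contain exactly one distinct input length and one distinct
# label length.
for name, ds in [("train", train_ds), ("valid", valid_ds), ("test", test_ds)]:
    in_lens = {len(x) for x in ds["input_ids"]}
    lab_lens = {len(x) for x in ds["labels"]}
    assert len(in_lens) == 1 and len(lab_lens) == 1, \
        f"inconsistent lengths in {name}: inputs {in_lens}, labels {lab_lens}"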
Then I train the model, performing a validation step at the end of every training epoch. The validation is based on the generated token_ids and not on the logits (predict_with_generate=True).
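To make the predict_with_generate=True choice concrete: without it, Seq2SeqTrainer hands compute_metrics the model's logits rather than generated token ids, so an extra argmax would be needed before decoding. A minimal sketch, with a hypothetical helper name:
import numpy as np

def to_token_ids(predictions):
    # Hypothetical helper, for illustration only.
    # predict_with_generate=True  -> predictions are generated token ids,
    #                                shape (num_examples, generated_len)
    # predict_with_generate=False -> predictions are logits,
    #                                shape (num_examples, seq_len, vocab_size)
    if isinstance(predictions, tuple):  # some models return extra outputs
        predictions = predictions[0]
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)
    return predictions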
bs = 1
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest", label_pad_token_id=-100)
evaluator = QGenMetrics(tokenizer)
trainer = Seq2SeqTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    data_collator=data_collator,
    compute_metrics=evaluator.compute_metrics_validation,
    args=Seq2SeqTrainingArguments(
        output_dir="./_remove",
        gradient_accumulation_steps=1,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=1,
        seed=3,
        data_seed=4,
        predict_with_generate=True,
        eval_strategy="epoch",
        report_to="none"
    )  #< training args
)  #< trainer
trainer.train()
The model is evaluated by the QGenMetrics class, specifically by its compute_metrics_validation method:
def compute_metrics_validation(self, eval_preds):
    predictions = eval_preds.predictions
    labels = eval_preds.label_ids
    try:
        labels = self.clean_labels(labels)
        predictions = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
    except Exception as e:
        print(e)
        print("PREDS")
        print(predictions)
        print("LABELS")
        print(labels)
        for i, pred in enumerate(predictions):
            if -100 in pred:
                print(f"preds[{i}]: {pred}")
        assert False
    res = {"metric": 1.0}
    return res
The labels are cleaned: their padding value -100 is replaced with the padding token provided by the tokenizer. Then both the predictions and the labels are decoded by the tokenizer.
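For reference, the cleaning step is equivalent to something like the following numpy sketch (eval_preds.label_ids is typically a numpy array at this point, so np.where works as well as the in-place assignment):
import numpy as np

def clean_ids(ids, pad_token_id, ignore_index=-100):
    # Replace the ignore value used for loss masking with the tokenizer's
    # pad id, so that batch_decode never sees a negative token id.
    return np.where(ids == ignore_index, pad_token_id, ids)

# e.g. labels = clean_ids(eval_preds.label_ids, tokenizer.pad_token_id)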
The problem is that the tokenizer finds the value -100 in the predicted tokens:
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[[repeated for every validation sample]]
out of range integral type conversion attempted
[...]
preds[0]: [ 0 3 9 28376 1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 -100]
[...]
preds[199]: [ 0 328 1944 3 9 467 13 3 2951 7 16 8 3 2951
7 5892 5 1 0 0 -100]
As you can see:
- Trainer.tokenizer is used, but I cannot see where, because I explicitly call the tokenizer when I need it.
- Every validation prediction has a final -100, which should not be there.
I am sure the error is caused by a very silly step I made in my code, but I cannot see which one.
Is there anyone who can help me?
I have prepared a notebook on Colab, but I am not sure it runs correctly, as I cannot get access to a runtime with a GPU.
https://colab.research.google.com/drive/1Ms_gcI_loWXKDPVkEtjG_W3xnsZH0YTT?usp=sharing