Please consider the following code:
from datasets import load_dataset_builder, load_dataset
import numpy as np
import os
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
class QGenMetrics:
    def __init__(self, tokenizer, ignore_index=-100):
        self.tokenizer = tokenizer
        self.ignore_index = ignore_index

    def clean_labels(self, labels):
        labels[labels == self.ignore_index] = self.tokenizer.pad_token_id
        return labels

    def compute_metrics_validation(self, eval_preds):
        predictions = eval_preds.predictions
        labels = eval_preds.label_ids
        # predictions, labels = eval_preds
        try:
            labels = self.clean_labels(labels)
            predictions = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
            labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
        except Exception as e:
            print(e)
            print("PREDS")
            print(predictions)
            print("LABELS")
            print(labels)
            for i, pred in enumerate(predictions):
                if -100 in pred:
                    print(f"preds[{i}]: {pred}")
            assert False
        res = {"metric": 1.0}
        return res

    def compute_metrics_test(self, test_preds):
        res = {"metric": 1.0}
        return res
#<
def actual_encoding(examples, tokenizer, max_source_len=None, max_target_len=None, ignore_label=-100):
    # if no max length is given: no padding and no truncation, the collator will do the job
    prompts_enc = tokenizer(examples["question"],
                            truncation=True if max_source_len else False,
                            max_length=max_source_len if max_source_len else None,
                            padding="max_length" if max_source_len else False,
                            return_tensors="pt" if max_source_len else None)
    targets_enc = tokenizer(examples["text"],
                            truncation=True if max_target_len else False,
                            padding="max_length" if max_target_len else False,
                            max_length=max_target_len if max_target_len else None,
                            return_tensors="pt" if max_target_len else None)
    examples["input_ids"] = prompts_enc["input_ids"]
    examples["attention_mask"] = prompts_enc["attention_mask"]
    examples["labels"] = targets_enc["input_ids"]
    return examples
#< actual_encoding
# download Bilkies/QuestionGeneration from HF hub
# https://huggingface.co/datasets/Bilkies/QuestionGeneration
ds_name = 'Bilkies/QuestionGeneration'
ds_builder = load_dataset_builder(ds_name)
print(ds_builder.info)
dataset = load_dataset(ds_name)
display(dataset)
train_ds = dataset['train']
print("number of original training point", len(train_ds))
# subsample train_ds
train_ds = train_ds.select(range(1000))
print("after sampling", len(train_ds))
test_ds = dataset['validation'].select(range(500))
# split training_ds in 80/20 for training and validation
train_ds = train_ds.train_test_split(test_size=0.2)
valid_ds = train_ds['test']
train_ds = train_ds['train']
model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
max_source_len = 31
max_target_len = 50
train_ds = train_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len":max_source_len, "max_target_len":max_target_len}, batched=True, num_proc=2)
valid_ds = valid_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len": max_source_len, "max_target_len": max_target_len}, batched=True, num_proc=2)
test_ds = test_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len": max_source_len, "max_target_len": max_target_len}, batched=True, num_proc=2)
for ds in [train_ds, valid_ds, test_ds]:
    ds_in_lens = [len(ex["input_ids"]) for ex in ds]
    ds_lab_lens = [len(ex["labels"]) for ex in ds]
    check = np.array([l == ds_in_lens[0] for l in ds_in_lens[1:]]).all() and np.array([l == ds_lab_lens[0] for l in ds_lab_lens[1:]]).all()
    assert check, f"check lengths in {ds}"
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
model = model.to(torch.device("cuda:0"))
bs = 1
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest", label_pad_token_id=-100)
evaluator = QGenMetrics(tokenizer)
trainer = Seq2SeqTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    data_collator=data_collator,
    compute_metrics=evaluator.compute_metrics_validation,
    args=Seq2SeqTrainingArguments(
        output_dir="./_remove",
        gradient_accumulation_steps=1,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=1,
        seed=3,
        data_seed=4,
        predict_with_generate=True,
        eval_strategy="epoch",
        report_to="none"
    )  #< training args
)  #< trainer
trainer.train()
Which I explain here:
# download Bilkies/QuestionGeneration from HF hub
# https://huggingface.co/datasets/Bilkies/QuestionGeneration
ds_name = 'Bilkies/QuestionGeneration'
ds_builder = load_dataset_builder(ds_name)
print(ds_builder.info)
dataset = load_dataset(ds_name)
display(dataset)
train_ds = dataset['train']
print("number of original training point", len(train_ds))
# subsample train_ds
train_ds = train_ds.select(range(1000))
print("after sampling", len(train_ds))
test_ds = dataset['validation'].select(range(500))
# split training_ds in 80/20 for training and validation
train_ds = train_ds.train_test_split(test_size=0.2)
valid_ds = train_ds['test']
train_ds = train_ds['train']
After downloading the dataset Bilkies/QuestionGeneration, I keep only the first 1000 examples for "speed" purposes. Then I create three data partitions:
- the test dataset test_ds, which corresponds to the validation partition of the HF dataset;
- train_ds and valid_ds, which correspond to 80% and 20% of the (subsampled) training data, respectively.
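As a sanity check (just a sketch; the column names "question" and "text" are the ones actual_encoding expects below), the raw fields can be inspected before tokenization:
# Sketch: confirm the raw columns that actual_encoding will read.
print(train_ds.column_names)   # should include "question" and "text"
print(valid_ds.column_names)
print(test_ds.column_names)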
model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
max_source_len = 31
max_target_len = 50
train_ds = train_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len":max_source_len, "max_target_len":max_target_len}, batched=True, num_proc=2)
valid_ds = valid_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len": max_source_len, "max_target_len": max_target_len}, batched=True, num_proc=2)
test_ds = test_ds.map(actual_encoding, fn_kwargs={"tokenizer": tokenizer, "max_source_len": max_source_len, "max_target_len": max_target_len}, batched=True, num_proc=2)
for ds in [train_ds, valid_ds, test_ds]:
    ds_in_lens = [len(ex["input_ids"]) for ex in ds]
    ds_lab_lens = [len(ex["labels"]) for ex in ds]
    check = np.array([l == ds_in_lens[0] for l in ds_in_lens[1:]]).all() and np.array([l == ds_lab_lens[0] for l in ds_lab_lens[1:]]).all()
    assert check, f"check lengths in {ds}"
I download a model and the related tokenizer, then tokenize the three datasets using actual_encoding, with pre-determined max lengths for input_ids and labels. The for loop at the end checks that all the fields used in training have the same length.
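The same check can also be written more compactly; a sketch of an equivalent version using column access on the encoded datasets:
# Sketch of an equivalent length check: after padding to max_length, every
# dataset should contain exactly one distinct input length and one distinct
# label length.
for name, ds in [("train", train_ds), ("valid", valid_ds), ("test", test_ds)]:
    in_lens = {len(x) for x in ds["input_ids"]}
    lab_lens = {len(x) for x in ds["labels"]}
    assert len(in_lens) == 1 and len(lab_lens) == 1, \
        f"inconsistent lengths in {name}: inputs {in_lens}, labels {lab_lens}"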
Then I train the model, performing a validation step at the end of every training epoch. The validation is based on the generated token_ids and not on the logits (predict_with_generate=True).
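To make the predict_with_generate=True choice concrete: without it, Seq2SeqTrainer hands compute_metrics the model's logits rather than generated token ids, so an extra argmax would be needed before decoding. A minimal sketch, with a hypothetical helper name:
import numpy as np

def to_token_ids(predictions):
    # Hypothetical helper, for illustration only.
    # predict_with_generate=True  -> predictions are generated token ids,
    #                                shape (num_examples, generated_len)
    # predict_with_generate=False -> predictions are logits,
    #                                shape (num_examples, seq_len, vocab_size)
    if isinstance(predictions, tuple):  # some models return extra outputs
        predictions = predictions[0]
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)
    return predictions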
bs = 1
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest", label_pad_token_id=-100)
evaluator = QGenMetrics(tokenizer)
trainer = Seq2SeqTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    data_collator=data_collator,
    compute_metrics=evaluator.compute_metrics_validation,
    args=Seq2SeqTrainingArguments(
        output_dir="./_remove",
        gradient_accumulation_steps=1,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        num_train_epochs=1,
        seed=3,
        data_seed=4,
        predict_with_generate=True,
        eval_strategy="epoch",
        report_to="none"
    )  #< training args
)  #< trainer
trainer.train()
The model is evaluated by the QGenMetrics class, specifically by its compute_metrics_validation method:
def compute_metrics_validation(self, eval_preds):
    predictions = eval_preds.predictions
    labels = eval_preds.label_ids
    try:
        labels = self.clean_labels(labels)
        predictions = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
        labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
    except Exception as e:
        print(e)
        print("PREDS")
        print(predictions)
        print("LABELS")
        print(labels)
        for i, pred in enumerate(predictions):
            if -100 in pred:
                print(f"preds[{i}]: {pred}")
        assert False
    res = {"metric": 1.0}
    return res
The labels are cleaned: their padding value -100 is replaced with the padding token provided by the tokenizer. Then both the predictions and the labels are decoded by the tokenizer.
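For reference, the cleaning step is equivalent to something like the following numpy sketch (eval_preds.label_ids is typically a numpy array at this point, so np.where works as well as the in-place assignment):
import numpy as np

def clean_ids(ids, pad_token_id, ignore_index=-100):
    # Replace the ignore value used for loss masking with the tokenizer's
    # pad id, so that batch_decode never sees a negative token id.
    return np.where(ids == ignore_index, pad_token_id, ids)

# e.g. labels = clean_ids(eval_preds.label_ids, tokenizer.pad_token_id)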
The problem is that the tokenizer finds the value -100 in the predicted tokens:
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
[[repeated for every validation sample]]
out of range integral type conversion attempted
[...]
preds[0]: [ 0 3 9 28376 1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 -100]
[...]
preds[199]: [ 0 328 1944 3 9 467 13 3 2951 7 16 8 3 2951
7 5892 5 1 0 0 -100]
As you can see:
- Trainer.tokenizer is used, but I cannot see where, because I explicitly call the tokenizer when I need it.
- Every validation prediction has a final -100, which should not be there.
I am sure the error is caused by a very silly step I made in my code, but I cannot see which one.
Is there anyone who can help me?
I have prepared a notebook on Colab, but I am not sure it runs correctly, as I cannot get access to a runtime with a GPU.
https://colab.research.google.com/drive/1Ms_gcI_loWXKDPVkEtjG_W3xnsZH0YTT?usp=sharing