I’m attempting to prepare a model for question-answer tasks for my thesis, in the following code:
`
import os
import json
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import DataCollatorForLanguageModeling, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TrainerCallback
import matplotlib.pyplot as plt
class PlotCallback(TrainerCallback):
def __init__(self):
self.train_losses = []
self.eval_losses = []
self.eval_steps = []
def on_log(self, args, state, control, logs=None, **kwargs):
if state.is_world_process_zero:
if 'loss' in logs:
self.train_losses.append(logs['loss'])
if 'eval_loss' in logs:
self.eval_losses.append(logs['eval_loss'])
self.eval_steps.append(state.global_step)
def on_train_end(self, args, state, control, **kwargs):
# Tanulási veszteség diagram
plt.figure(figsize=(10, 6))
plt.plot(self.train_losses, label='Training Loss', color='blue')
plt.title('Training Loss over Training Steps')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()
class MyDataCollator(DataCollatorForLanguageModeling):
def __init__(self, tokenizer):
super().__init__(tokenizer, mlm=False) # mlm=False a kausalitás alapú nyelvi modellezéshez
def collate_batch(self, examples):
contexts = [example['Context'] for example in examples]
qa_lists = [example['Questions_Answers'] for example in examples]
inputs = []
labels = []
for context, qa_list in zip(contexts, qa_lists):
context_inputs = self.tokenizer(context, return_tensors="tf", padding=True, truncation=True)
# Konvertálja a context_inputs tenzorokat PyTorch tenzorokká
context_input_ids = torch.tensor(context_inputs["input_ids"].numpy(), dtype=torch.long)
context_attention_mask = torch.tensor(context_inputs["attention_mask"].numpy(), dtype=torch.long)
for qa in qa_list:
question_inputs = self.tokenizer(qa['Question'], return_tensors="tf", padding=True, truncation=True)
# Konvertálja a question_inputs tenzorokat PyTorch tenzorokká
question_input_ids = torch.tensor(question_inputs["input_ids"].numpy(), dtype=torch.long)
question_attention_mask = torch.tensor(question_inputs["attention_mask"].numpy(), dtype=torch.long)
# Az input és label adatok hozzáadása
inputs.append((context_input_ids, context_attention_mask))
labels.append((question_input_ids[0], question_attention_mask[0]))
# Továbbra is a torch.nn.utils.rnn.pad_sequence-t használjuk a padding-hez
return {
"input_ids": torch.nn.utils.rnn.pad_sequence([i[0] for i in inputs], batch_first=True),
"attention_mask": torch.nn.utils.rnn.pad_sequence([i[1] for i in inputs], batch_first=True),
"labels": torch.nn.utils.rnn.pad_sequence([l[0] for l in labels], batch_first=True),
"labels_attention_mask": torch.nn.utils.rnn.pad_sequence([l[1] for l in labels], batch_first=True)
}
with open("train.MILQA-2023-03-27.squad.s.json", "r", encoding="utf-8") as f:
data = json.load(f)
inputs = []
for paragraph in data["data"][0]["paragraphs"]:
context = paragraph["context"]
qa_list = []
for qa in paragraph["qas"]:
question = qa["question"]
short_answer = None
long_answer = None
short_start = None
short_end = None
long_start = None
long_end = None
if "answers" in qa:
if "short" in qa["answers"]:
short_answer = qa["answers"]["short"][0]["text"]
short_start = qa["answers"]["short"][0]["start"]
short_end = qa["answers"]["short"][0]["end"]
if "long" in qa["answers"]:
long_answer = qa["answers"]["long"][0]["text"]
long_start = qa["answers"]["long"][0]["start"]
long_end = qa["answers"]["long"][0]["end"]
qa_list.append({'Question': question, 'Short Answer': short_answer, 'Long Answer': long_answer})
inputs.append({'Context': context, 'Questions_Answers': qa_list})
for item in data["data"][1:]:
for paragraph in item["paragraphs"]:
context = paragraph["context"]
qa_list = []
for qa in paragraph["qas"]:
question = qa["question"]
short_answer = None
long_answer = None
short_start = None
short_end = None
long_start = None
long_end = None
if "answers" in qa:
if "short" in qa["answers"]:
short_answer = qa["answers"]["short"][0]["text"]
short_start = qa["answers"]["short"][0]["start"]
short_end = qa["answers"]["short"][0]["end"]
if "long" in qa["answers"]:
long_answer = qa["answers"]["long"][0]["text"]
long_start = qa["answers"]["long"][0]["start"]
long_end = qa["answers"]["long"][0]["end"]
qa_list.append({'Question': question, 'Short Answer': short_answer, 'Long Answer': long_answer})
inputs.append({'Context': context, 'Questions_Answers': qa_list})
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = MyDataCollator(tokenizer)
batch = data_collator.collate_batch(inputs)
output_dir = "./finetuned_model"
if not os.path.exists(output_dir):
os.makedirs(output_dir)
training_args = TrainingArguments(
output_dir=output_dir,
overwrite_output_dir=True,
num_train_epochs=2,
learning_rate=2e-4,
per_device_train_batch_size=2,
warmup_ratio=0.1,
lr_scheduler_type="linear",
save_strategy="epoch"
)
trainer = Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=batch, # Itt közvetlenül adjuk át a batch-et
callbacks=[PlotCallback()],
)
trainer.train()
tokenizer.save_pretrained(output_dir)
`
However, I’m encountering the following errors:
C:UsersLeventeDesktopminigpt.venvScriptspython.exe C:UsersLeventeDesktopminigpttrain.py
2024-05-01 12:57:58.680455: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-01 12:57:59.278669: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-01 12:58:04.644087: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Traceback (most recent call last):
File "C:UsersLeventeDesktopminigpttrain.py", line 133, in <module>
batch = data_collator.collate_batch(inputs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:UsersLeventeDesktopminigpttrain.py", line 64, in collate_batch
"input_ids": torch.nn.utils.rnn.pad_sequence([i[0] for i in inputs], batch_first=True),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:UsersLeventeDesktopminigpt.venvLibsite-packagestorchnnutilsrnn.py", line 399, in pad_sequence
return torch._C._nn.pad_sequence(sequences, batch_first, padding_value)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: The size of tensor a (349) must match the size of tensor b (327) at non-singleton dimension 1
Process finished with exit code 1
What could be the solution to the problem to make the program run successfully? Unfortunately, time is pressing, and I’m quite puzzled.
Unfortunately, I’ve tried many things, but I’ve encountered various other problems. The data processing works fine, but the issue always arises when I pass it to the trainer. There was a problem with passing a list containing dictionaries, and another problem occurred with “Scalar tensor has no len()”. What could be the solution? Could you help me fix the code?
Levente Ledenyk lev4922 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.