I am trying to learn how to fine-tune a pretrained model and then use it. This is my code:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import torch
# Define a simple accuracy metric
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": (preds == labels).mean()}
# Load the dataset
dataset = load_dataset("imdb", split='train[:1%]')
# Split once, so train and eval come from the same split and cannot overlap
# (two separate train_test_split calls would produce two independent random splits)
split_dataset = dataset.train_test_split(test_size=0.1)
small_train_dataset = split_dataset['train']
small_eval_dataset = split_dataset['test']
# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)
small_train_dataset = small_train_dataset.map(tokenize_function, batched=True)
small_eval_dataset = small_eval_dataset.map(tokenize_function, batched=True)
small_train_dataset = small_train_dataset.rename_column("label", "labels")
small_eval_dataset = small_eval_dataset.rename_column("label", "labels")
small_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
small_eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
# Define training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
# Train the model
trainer.train()
# Evaluate the model
validation_results = trainer.evaluate()
print(validation_results)
Now I am trying to make a prediction with the fine-tuned model, like this:
inputs = tokenizer(dataset[0]['text'], padding="max_length", truncation=True, return_tensors="pt")
predictions = trainer.predict(test_dataset=inputs)
This is the error I get when I try to make the prediction:
IndexError                                Traceback (most recent call last)
Cell In[8], line 7
      3 inputs = tokenizer(dataset[0]['text'], padding="max_length", truncation=True, return_tensors="pt")
      6 # Make predictions
----> 7 predictions = trainer.predict(test_dataset=inputs)

File C:\Python311\Lib\site-packages\transformers\trainer.py:3305, in Trainer.predict(self, test_dataset, ignore_keys, metric_key_prefix)
   3302 start_time = time.time()
   3304 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 3305 output = eval_loop(
   3306     test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
   3307 )
   3308 total_batch_size = self.args.eval_batch_size * self.args.world_size
   3309 if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:

File C:\Python311\Lib\site-packages\transformers\trainer.py:3408, in Trainer.evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
   3406 observed_num_examples = 0
   3407 # Main evaluation loop
-> 3408 for step, inputs in enumerate(dataloader):
   3409     # Update the observed num examples
   3410     observed_batch_size = find_batch_size(inputs)
   3411     if observed_batch_size is not None:

File C:\Python311\Lib\site-packages\accelerate\data_loader.py:454, in DataLoaderShard.__iter__(self)
    452 # We iterate one batch ahead to check when we are at the end
    453 try:
--> 454     current_batch = next(dataloader_iter)
    455 except StopIteration:
    456     yield

File C:\Python311\Lib\site-packages\torch\utils\data\dataloader.py:631, in _BaseDataLoaderIter.__next__(self)
    628 if self._sampler_iter is None:
    629     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    630     self._reset()  # type: ignore[call-arg]
--> 631 data = self._next_data()
    632 self._num_yielded += 1
    633 if self._dataset_kind == _DatasetKind.Iterable and \
    634         self._IterableDataset_len_called is not None and \
    635         self._num_yielded > self._IterableDataset_len_called:

File C:\Python311\Lib\site-packages\torch\utils\data\dataloader.py:675, in _SingleProcessDataLoaderIter._next_data(self)
    673 def _next_data(self):
    674     index = self._next_index()  # may raise StopIteration
--> 675     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    676     if self._pin_memory:
    677         data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File C:\Python311\Lib\site-packages\torch\utils\data\_utils\fetch.py:51, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
     49     data = self.dataset.__getitems__(possibly_batched_index)
     50 else:
---> 51     data = [self.dataset[idx] for idx in possibly_batched_index]
     52 else:
     53     data = self.dataset[possibly_batched_index]

File C:\Python311\Lib\site-packages\torch\utils\data\_utils\fetch.py:51, in <listcomp>(.0)
     49     data = self.dataset.__getitems__(possibly_batched_index)
     50 else:
---> 51     data = [self.dataset[idx] for idx in possibly_batched_index]
     52 else:
     53     data = self.dataset[possibly_batched_index]

File C:\Python311\Lib\site-packages\transformers\tokenization_utils_base.py:255, in BatchEncoding.__getitem__(self, item)
    253     return self.data[item]
    254 elif self._encodings is not None:
--> 255     return self._encodings[item]
    256 elif isinstance(item, slice):
    257     return {key: self.data[key][item] for key in self.data.keys()}

IndexError: list index out of range
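From the last frame of the traceback, my guess is that the DataLoader is indexing into the tokenizer's BatchEncoding as if it were a per-example dataset, which makes me think trainer.predict wants a map-style dataset rather than the single encoded input I am passing. Below is a minimal sketch of what I think the call should look like instead, wrapping the one example in a datasets.Dataset via Dataset.from_dict (this is my guess at the intended usage, not something I have confirmed). Is this the right way to run a single prediction?

from datasets import Dataset

# Wrap the single encoded example in a one-row map-style dataset,
# since predict() iterates its input with a DataLoader
sample = tokenizer(dataset[0]['text'], padding="max_length", truncation=True)
predict_dataset = Dataset.from_dict({k: [v] for k, v in sample.items()})
predict_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

predictions = trainer.predict(test_dataset=predict_dataset)
print(np.argmax(predictions.predictions, axis=1))

Or, for single examples, should I skip the Trainer entirely and just call model(**inputs) inside torch.no_grad()?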