I’m getting very low accuracy (7.90%) when evaluating a Hugging Face DistilBERT model on the MNLI matched validation set. I suspect I’m not loading or preparing the model correctly. Could someone help me verify whether my tokenization, data loading, and evaluation are correct? Any advice or pointers would be greatly appreciated!
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
# Map numeric label ids to text labels for MNLI
label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}
# Tokenizes the MNLI premise/hypothesis pairs for the model.
def prepare_data(tokenizer, dataset, max_length=512):
    # Tokenizes each premise/hypothesis pair
    def tokenize_function(example):
        return tokenizer(
            example["premise"],      # Premise text
            example["hypothesis"],   # Hypothesis text
            truncation=True,         # Truncate sequences longer than max_length
            padding="max_length",    # Pad shorter sequences to max_length
            max_length=max_length    # Maximum token length for each input
        )

    # Apply tokenization to the entire dataset
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Debug: print a tokenized sample
    print("Sample tokenized example:", tokenized_dataset[0])

    # Check sequence lengths
    for example in tokenized_dataset:
        input_length = len(example["input_ids"])  # Length of the tokenized input sequence
        # Ensure that no sequence exceeds the defined max length
        assert input_length <= max_length, f"Input sequence exceeds max length: {input_length}"
    print("All tokenized sequences are within the max length.")

    return tokenized_dataset
# Computes the accuracy of predictions compared to references
def compute_accuracy(predictions, references):
    correct = sum(p == r for p, r in zip(predictions, references))  # Count correct predictions
    return correct / len(references)  # Return accuracy as a fraction
def main():
    # Load the pre-trained tokenizer and model
    model_name = "huggingface/distilbert-base-uncased-finetuned-mnli"
    tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load tokenizer for the specified model
    model = AutoModelForSequenceClassification.from_pretrained(model_name)  # Load pre-trained sequence classification model
    model.eval()  # Set model to evaluation mode

    # Load the MNLI dataset
    mnli_dataset = load_dataset("glue", "mnli")  # Load GLUE MNLI dataset
    test_data = mnli_dataset["validation_matched"]  # Use matched validation set for evaluation

    # Tokenize the validation dataset
    tokenized_test_data = prepare_data(tokenizer, test_data)

    # Collate function for turning dataset examples into PyTorch batches
    def collate_fn(batch):
        # Keys to extract for model inputs
        keys = ["input_ids", "attention_mask"]
        # Create tensors for inputs and labels
        inputs = {key: torch.tensor([example[key] for example in batch]) for key in keys}
        labels = torch.tensor([example["label"] for example in batch])
        return inputs, labels

    test_loader = DataLoader(
        tokenized_test_data,    # Tokenized dataset
        batch_size=32,          # Adjust batch size for CPU
        collate_fn=collate_fn   # Collate function for preparing batches
    )

    # Evaluate the model
    predictions = []  # Store model predictions
    references = []   # Store ground-truth labels
    for batch in tqdm(test_loader, desc="Evaluating"):  # Iterate over the DataLoader
        inputs, labels = batch  # Extract inputs and labels from the batch
        with torch.no_grad():  # Disable gradient computation for evaluation
            outputs = model(**inputs)  # Forward pass through the model
            logits = outputs.logits  # Extract logits from the model outputs
            batch_predictions = torch.argmax(logits, dim=-1).tolist()  # Predicted class ids
        predictions.extend(batch_predictions)  # Append predictions to the list
        references.extend(labels.tolist())  # Append reference labels to the list

    # Debugging output
    print("Sample predictions (readable):", [label_map[p] for p in predictions[:5]])  # First 5 predictions
    print("Sample references (readable):", [label_map[r] for r in references[:5]])  # First 5 references

    # Compute and print accuracy
    accuracy = compute_accuracy(predictions, references)  # Calculate accuracy
    print(f"Accuracy on MNLI validation set: {accuracy * 100:.2f}%")  # Print accuracy as a percentage


if __name__ == "__main__":
    main()  # Run the main function
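In case it is relevant, here is a small sanity check I was considering running separately (a minimal sketch; I'm assuming the checkpoint's config.id2label is populated and that the GLUE label feature exposes .names) to confirm that my label_map actually matches both the model's output ordering and the dataset's label ids:

from transformers import AutoModelForSequenceClassification
from datasets import load_dataset

model = AutoModelForSequenceClassification.from_pretrained(
    "huggingface/distilbert-base-uncased-finetuned-mnli"
)
# Label ordering the checkpoint was fine-tuned with (index -> label name)
print("Model id2label:", model.config.id2label)

mnli = load_dataset("glue", "mnli")
# Label ordering used by the GLUE/MNLI dataset (index -> label name)
print("Dataset label names:", mnli["validation_matched"].features["label"].names)

Would comparing these two orderings be the right way to rule out a label-mapping mismatch, or is my evaluation loop itself flawed?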