I’m getting very low accuracy (7.90%) when evaluating a Hugging Face DistilBERT model on the MNLI matched validation set. I suspect I’m not loading or preparing the model correctly. Could someone help me verify whether my tokenization, data loading, and evaluation are correct? Any advice or pointers would be greatly appreciated!
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
# Map numeric label ids to text labels for MNLI
label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}
# Tokenizes the MNLI premise/hypothesis pairs for the model.
def prepare_data(tokenizer, dataset, max_length=512):
    # Tokenizes each premise/hypothesis pair
    def tokenize_function(example):
        return tokenizer(
            example["premise"],      # Premise text
            example["hypothesis"],   # Hypothesis text
            truncation=True,         # Truncate sequences longer than max_length
            padding="max_length",    # Pad shorter sequences to max_length
            max_length=max_length    # Maximum token length for each input
        )

    # Apply tokenization to the entire dataset
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Debug: print a tokenized sample
    print("Sample tokenized example:", tokenized_dataset[0])

    # Check sequence lengths
    for example in tokenized_dataset:
        input_length = len(example["input_ids"])  # Length of the tokenized input sequence
        # Ensure that no sequence exceeds the defined max length
        assert input_length <= max_length, f"Input sequence exceeds max length: {input_length}"
    print("All tokenized sequences are within the max length.")

    return tokenized_dataset
# Computes the accuracy of predictions compared to references
def compute_accuracy(predictions, references):
    correct = sum(p == r for p, r in zip(predictions, references))  # Count correct predictions
    return correct / len(references)  # Return accuracy as a fraction
def main():
    # Load the pre-trained tokenizer and model
    model_name = "huggingface/distilbert-base-uncased-finetuned-mnli"
    tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load tokenizer for the specified model
    model = AutoModelForSequenceClassification.from_pretrained(model_name)  # Load pre-trained sequence classification model
    model.eval()  # Set model to evaluation mode

    # Load the MNLI dataset
    mnli_dataset = load_dataset("glue", "mnli")  # Load GLUE MNLI dataset
    test_data = mnli_dataset["validation_matched"]  # Use matched validation set for evaluation

    # Tokenize the validation dataset
    tokenized_test_data = prepare_data(tokenizer, test_data)

    # Collate function for turning dataset examples into PyTorch batches
    def collate_fn(batch):
        # Keys to extract for model inputs
        keys = ["input_ids", "attention_mask"]
        # Create tensors for inputs and labels
        inputs = {key: torch.tensor([example[key] for example in batch]) for key in keys}
        labels = torch.tensor([example["label"] for example in batch])
        return inputs, labels

    test_loader = DataLoader(
        tokenized_test_data,    # Tokenized dataset
        batch_size=32,          # Adjust batch size for CPU
        collate_fn=collate_fn   # Collate function for preparing batches
    )

    # Evaluate the model
    predictions = []  # Store model predictions
    references = []   # Store ground-truth labels
    for batch in tqdm(test_loader, desc="Evaluating"):  # Iterate over the DataLoader
        inputs, labels = batch  # Extract inputs and labels from the batch
        with torch.no_grad():  # Disable gradient computation for evaluation
            outputs = model(**inputs)  # Forward pass through the model
            logits = outputs.logits  # Extract logits from the model outputs
            batch_predictions = torch.argmax(logits, dim=-1).tolist()  # Predicted class ids
        predictions.extend(batch_predictions)  # Append predictions to the list
        references.extend(labels.tolist())  # Append reference labels to the list

    # Debugging output
    print("Sample predictions (readable):", [label_map[p] for p in predictions[:5]])  # First 5 predictions
    print("Sample references (readable):", [label_map[r] for r in references[:5]])  # First 5 references

    # Compute and print accuracy
    accuracy = compute_accuracy(predictions, references)  # Calculate accuracy
    print(f"Accuracy on MNLI validation set: {accuracy * 100:.2f}%")  # Print accuracy as a percentage


if __name__ == "__main__":
    main()  # Run the main function
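In case it is relevant, here is a small sanity check I was considering running separately (a minimal sketch; I'm assuming the checkpoint's config.id2label is populated and that the GLUE label feature exposes .names) to confirm that my label_map actually matches both the model's output ordering and the dataset's label ids:

from transformers import AutoModelForSequenceClassification
from datasets import load_dataset

model = AutoModelForSequenceClassification.from_pretrained(
    "huggingface/distilbert-base-uncased-finetuned-mnli"
)
# Label ordering the checkpoint was fine-tuned with (index -> label name)
print("Model id2label:", model.config.id2label)

mnli = load_dataset("glue", "mnli")
# Label ordering used by the GLUE/MNLI dataset (index -> label name)
print("Dataset label names:", mnli["validation_matched"].features["label"].names)

Would comparing these two orderings be the right way to rule out a label-mapping mismatch, or is my evaluation loop itself flawed?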