I have the following function to compute and save metrics:
# Function to compute and save metrics
def compute_and_save_metrics(all_references, all_hypotheses, dataset_type, folder_name):
    """Compute BLEU-1..4, average ROUGE-L/ROUGE-4 F1, and WER for parallel
    lists of references and hypotheses, print debug samples, and return the
    metrics as a dict.

    NOTE(review): ``corpus_bleu`` expects *pre-tokenized* sequences. The
    ``.strip()`` checks below imply the inputs are raw strings, in which case
    BLEU is silently computed over characters. Likewise, rouge-score's default
    tokenizer keeps only ``[a-z0-9]`` tokens, so non-Latin (or otherwise
    stripped) text produces ROUGE = 0.0 — confirm the inputs are tokenized
    the way each metric expects.

    Parameters:
        all_references: list of reference strings (one per sample).
        all_hypotheses: list of hypothesis strings, parallel to references.
        dataset_type: label for the split being evaluated (unused here).
        folder_name: output folder for saved metrics (unused here).

    Returns:
        dict mapping metric names ('bleu1'..'bleu4', 'rougeL', 'rouge4',
        'wer') to their float values.
    """
    # Build the single-reference wrapper once instead of four times.
    wrapped_refs = [[ref] for ref in all_references]

    # Compute BLEU scores with uniform weights; BLEU-3 uses exact thirds
    # (the original 0.33 weights summed to 0.99, skewing the score slightly).
    bleu_score1 = corpus_bleu(wrapped_refs, all_hypotheses, weights=(1.0, 0.0, 0.0, 0.0))
    bleu_score2 = corpus_bleu(wrapped_refs, all_hypotheses, weights=(0.5, 0.5))
    bleu_score3 = corpus_bleu(wrapped_refs, all_hypotheses, weights=(1 / 3, 1 / 3, 1 / 3))
    bleu_score4 = corpus_bleu(wrapped_refs, all_hypotheses, weights=(0.25, 0.25, 0.25, 0.25))

    # Debugging: print some sample references and hypotheses.
    # (Restored the "\n" escapes that were mangled to a bare "n".)
    print("\nSample References and Hypotheses:")
    for i in range(min(5, len(all_references))):
        print(f"Reference {i}: {all_references[i]}")
        print(f"Hypothesis {i}: {all_hypotheses[i]}\n")

    # Count empty strings without materializing throwaway lists.
    print(f"Number of empty references: {sum(1 for ref in all_references if not ref.strip())}")
    print(f"Number of empty hypotheses: {sum(1 for hyp in all_hypotheses if not hyp.strip())}")

    # Compute ROUGE-L and ROUGE-4 per pair (score(target, prediction) order),
    # then average the F1 measures across the corpus.
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL', 'rouge4'], use_stemmer=False)
    rouge_scores = [rouge_scorer_obj.score(ref, hyp) for ref, hyp in zip(all_references, all_hypotheses)]
    avg_rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])
    avg_rouge4 = np.mean([score['rouge4'].fmeasure for score in rouge_scores])

    # Debugging: print ROUGE scores for a few samples.
    print("\nSample ROUGE Scores:")
    for i in range(min(5, len(rouge_scores))):
        print(f"ROUGE-L: {rouge_scores[i]['rougeL']}")
        print(f"ROUGE-4: {rouge_scores[i]['rouge4']}\n")

    # Compute corpus-level WER.
    wer_score_value = wer(all_references, all_hypotheses)

    # Return the metrics so a caller can actually save or log them —
    # the original computed everything but returned nothing.
    return {
        'bleu1': bleu_score1,
        'bleu2': bleu_score2,
        'bleu3': bleu_score3,
        'bleu4': bleu_score4,
        'rougeL': avg_rougeL,
        'rouge4': avg_rouge4,
        'wer': wer_score_value,
    }
Below are the results:
Validation BLEU-1 Score: 0.4071
Validation BLEU-2 Score: 0.2853
Validation BLEU-3 Score: 0.2102
Validation BLEU-4 Score: 0.1303
Validation ROUGE-L Score: 0.0000
Validation ROUGE-4 Score: 0.0000
Validation WER Score: 1.3173
There must be an error in how I calculate the ROUGE-L score, but I can't figure out where. What's wrong here?