Here is the veryyy simple data set that I am using:
Nice Green 1.0 (cosine similarity value)
Orange Green 0.0 (same)
Here is the script that I am using:
import os
from datasets import Dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import TripletLoss
from sklearn.metrics.pairwise import cosine_similarity
# Path to the local file
file_path = "train_data.txt"
# Read the file and process the data
triplets = []
with open(file_path, "r", encoding="utf-8") as file:
for line in file:
parts = line.strip().rsplit(' ', 2) # Split by spaces from the right to capture last 3 parts as score and id
if len(parts) == 3:
sentence1, sentence2, score = parts
score = float(score)
if score == 1.0:
triplets.append({"anchor": sentence1, "positive": sentence2, "negative": sentence1}) # Dummy negative to be replaced
elif score == 0.0:
triplets.append({"anchor": sentence1, "positive": sentence1, "negative": sentence2}) # Dummy positive to be replaced
else:
print(f"Skipping line due to incorrect format: {line.strip()}")
# Create triplets by matching positive and negative examples
final_triplets = []
for triplet in triplets:
if triplet["negative"] == triplet["anchor"]: # It's a positive pair
# Find a negative example
for t in triplets:
if t["negative"] != triplet["anchor"] and t["positive"] != triplet["anchor"]:
final_triplets.append({
"anchor": triplet["anchor"],
"positive": triplet["positive"],
"negative": t["negative"]
})
break
elif triplet["positive"] == triplet["anchor"]: # It's a negative pair
# Find a positive example
for t in triplets:
if t["positive"] != triplet["anchor"] and t["negative"] != triplet["anchor"]:
final_triplets.append({
"anchor": triplet["anchor"],
"positive": t["positive"],
"negative": triplet["negative"]
})
break
# Ensure there is data to train on
if not final_triplets:
raise ValueError("No valid triplets found in train_data.txt.")
# Create a Dataset object
train_dataset = Dataset.from_dict({
"anchor": [triplet["anchor"] for triplet in final_triplets],
"positive": [triplet["positive"] for triplet in final_triplets],
"negative": [triplet["negative"] for triplet in final_triplets]
})
# Load a pre-trained model from the local directory to fine-tune
pretrained_model_path = "./sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(pretrained_model_path)
# Define a loss function
loss = TripletLoss(model=model)
# Specify training arguments
args = SentenceTransformerTrainingArguments(
output_dir="models/minilm-l6-v2-custom-triplets", # Checkpoints and other outputs during training will be saved here
num_train_epochs=20, # Increase the number of epochs for better training
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
learning_rate=2e-5,
warmup_ratio=0.1,
fp16=True,
eval_strategy="steps",
eval_steps=100,
save_strategy="steps",
save_steps=100,
save_total_limit=2,
logging_steps=100,
run_name="minilm-l6-v2-custom-triplets"
)
# Create a trainer and train
trainer = SentenceTransformerTrainer(
model=model,
args=args,
train_dataset=train_dataset,
loss=loss
)
trainer.train()
# Save the trained model
save_directory = os.path.join(os.getcwd(), "fine-tuned-model")
model.save_pretrained(save_directory)
# Load the trained model
model = SentenceTransformer(save_directory)
# Encode sentences
sentences = [
"2|~|Nice",
"1|~|Green",
"3|~|Orange"
]
embeddings = model.encode(sentences)
# Calculate cosine similarity
cos_sim_1_2 = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
cos_sim_1_3 = cosine_similarity([embeddings[0]], [embeddings[2]])[0][0]
print(f"Similarity between '2|~| Nice' and '1|~| Green': {cos_sim_1_2}")
print(f"Similarity between '2|~| Orange' and '3|~|Green': {cos_sim_1_3}")
Here are my results:
Similarity between ‘2|~| Nice’ and ‘1|~| Green’: 0.617717444896698
Similarity between ‘2|~| Orange’ and ‘3|~|Green’: 0.6824002265930176
The similarity between orange and green is way too close and between nice and green is not close enough despite the fact that the model has been trained on a dataset where nice and green are synonymous and orange and green are opposites.
I have tried a different loss function other than triplet loss (namely MultipleNegativesRankingLoss), but that produced similar results also. How do I get the similarity scores to reflect the training data???
CelticsBanner18 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.