Context – I am doing an NLP project to analyze the comments column in a data frame. I want to replace duplicates with the first occurrence if the meanings of the comments are the same.
I want to compare all the sentences and group together those that have a similar meaning.
I have tried a transformer for summarization and cosine similarity to compare sentences, but it is not giving me the desired results.
Any help will be deeply appreciated.
code is here –
#functions to lemmatize and remove punctuation
def remove_punctuation(text):
    """Normalize *text* and split it into cleaned sentences.

    Lower-cases and strips the input, sentence-tokenizes it FIRST, and only
    then removes punctuation from each sentence.

    BUG FIX: the original stripped punctuation *before* calling
    ``sent_tokenize``, which deletes the sentence-ending periods, so the
    tokenizer could never find a boundary and always returned the whole
    text as a single "sentence".

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list[str]
        Lower-cased, punctuation-free sentences, each stripped of
        surrounding whitespace.
    """
    text = text.lower().strip()
    # Tokenize while punctuation is still present so sentence boundaries
    # can actually be detected.
    sentences = sent_tokenize(text)
    # Build the translation table once, not per sentence.
    table = str.maketrans('', '', string.punctuation)
    return [s.translate(table).strip() for s in sentences]
def lemmatize_text(text):
    """Lower-case *text*, word-tokenize it, and lemmatize every token.

    Returns the lemmatized tokens re-joined into a single space-separated
    string.
    """
    wnl = WordNetLemmatizer()
    tokens = word_tokenize(text.lower())
    return " ".join(wnl.lemmatize(tok) for tok in tokens)
# Sentence-embedding model used to measure semantic similarity between comments.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Unique, normalized values of the comments column.  These will later be
# replaced by a canonical value to bring uniformity to the column.
#
# ROBUSTNESS: NaN cells in a pandas object column are floats, so calling
# .strip() on them raises AttributeError -- coerce everything to str first.
# The punctuation table is also built once instead of once per row.
_punct_table = str.maketrans('', '', string.punctuation)
Other_comments_unique = (
    df1["DUPLICATE - Other comments"]
    .astype(str)
    .apply(lambda x: x.strip().lower().translate(_punct_table))
    .unique()
)

# Replace each raw comment with its list of cleaned sentences.
df1['DUPLICATE - Other comments'] = (
    df1['DUPLICATE - Other comments'].astype(str).apply(remove_punctuation)
)
df1['DUPLICATE - Other comments'].head()
#summarization - meaning extracting the core message of the sentence. done in this cell
# Map each unique normalized comment to a semantically-similar sentence
# found in the data, so duplicate-in-meaning comments can be replaced by a
# single canonical form.
changed_values = {}

# Acceptance window: similar in meaning (> 0.5) but not (near-)identical
# text (< 0.99), same thresholds as before.
SIM_LOW, SIM_HIGH = 0.5, 0.99

# PERFORMANCE FIX: the original called model.encode(u) inside the innermost
# loop, i.e. O(rows * uniques * items) encode calls.  Encode each unique
# candidate exactly once, and each row's sentences once per row.
candidates = [u for u in Other_comments_unique if u != "nothing more"]
candidate_embeddings = model.encode(candidates)

for i in range(len(df1)):
    # NOTE(review): this reads df1['test'], but the cleaning above was
    # applied to 'DUPLICATE - Other comments' -- confirm which column is
    # meant to hold the tokenized sentence lists.
    items = list(df1['test'][i])
    if not items:
        continue
    item_embeddings = model.encode(items)
    for u, u_emb in zip(candidates, candidate_embeddings):
        for a, a_emb in zip(items, item_embeddings):
            sim = util.cos_sim(u_emb, a_emb).item()
            if SIM_LOW < sim < SIM_HIGH:
                # Last qualifying match wins, matching the original
                # dict.update semantics.
                changed_values[u] = a