I have some questions about generating a dissimilarity matrix for a set of text documents using word vectors. In the code below I tokenise each document, drop out-of-vocabulary (OOV) words, and sum the word vectors of the remaining words to form the document vector. I then compute the pairwise cosine distance between the document vectors. Is this approach correct? Some people say the word vectors should be averaged, others that they should be summed. Which is right?
import string

import numpy as np
import scipy.spatial.distance

def preprocess(text):
    # Strip punctuation, lowercase, and split on whitespace
    text = text.translate(str.maketrans('', '', string.punctuation))
    return [word.lower() for word in text.split()]

def get_vector(document, model):
    # Sum the word vectors of all in-vocabulary tokens
    words = preprocess(document)
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:
        # No in-vocabulary words: fall back to the zero vector
        return np.zeros(model.vector_size)
    return np.sum(np.array(word_vectors), axis=0)
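For reference, the averaged variant that people keep suggesting would presumably look like this (just a sketch of the alternative, assuming the same gensim-style model interface as above):

def get_vector_mean(document, model):
    # Average (instead of sum) the word vectors of in-vocabulary tokens
    words = preprocess(document)
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(np.array(word_vectors), axis=0)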
def wordembeddingsumcos(documents, model, variant_names):
    # Pairwise cosine distances between summed document vectors
    dissimilarity_matrix = np.zeros((len(documents), len(documents)))
    for i in range(len(documents)):
        for j in range(i + 1, len(documents)):
            dissimilarity_matrix[i][j] = scipy.spatial.distance.cosine(
                get_vector(documents[i], model), get_vector(documents[j], model))
            dissimilarity_matrix[j][i] = dissimilarity_matrix[i][j]
    return dissimilarity_matrix
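To make the setup concrete, this is roughly how I would call it (the documents and model are just placeholders; any gensim KeyedVectors should work, and the pretrained vectors are downloaded on first use):

import gensim.downloader

# Placeholder documents and model name, only for illustration
documents = ["the cat sat on the mat", "a dog chased the cat", "stock prices fell sharply"]
model = gensim.downloader.load("glove-wiki-gigaword-50")  # returns KeyedVectors
matrix = wordembeddingsumcos(documents, model, variant_names=["doc1", "doc2", "doc3"])
print(matrix)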