I’m implementing a skip-gram Word2Vec model in Python. However, the model doesn’t seem to be working correctly, judging by the resulting embeddings and their visualization: the 3D plot of the embeddings shows all the words clustered together and overlapping, making it difficult to distinguish between them.
I suspect that the issue lies in my implementation rather than the plotting function.
import numpy as np
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import re
np.random.seed(10)
def softmax(x):
    '''
    Numerically stable softmax:

        softmax(x_i) = exp(x_i - max(x)) / sum_j exp(x_j - max(x))
    '''
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum()
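# quick illustrative self-check (arbitrary values): a softmax output should sum to 1
assert np.isclose(softmax(np.array([1.0, 2.0, 3.0])).sum(), 1.0)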
class SkipGram:
def __init__(self,ws=2,dim=8) -> None:
self.X = []
self.N = dim
self.Y = []
self.window_size = ws
self.alpha = 0.1
self.vocab = {}
self.vocab_size = 0
def __create_vocabulary(self,corpus):
stop_words = set(stopwords.words("english"))
filtered_corpus = []
self.vocab_size = 0
for i,sentence in enumerate(corpus):
if isinstance(sentence,str) :
corpus[i] = sentence.split()
filtered_corpus.append([])
j = 0
for word in corpus[i]:
w = re.sub(r'[^a-z]+','',word.lower())
if w != '' and w not in stop_words:
corpus[i][j] = w
filtered_corpus[i].append(w)
else:
continue
if corpus[i][j].lower() not in self.vocab:
self.vocab[corpus[i][j].lower()] = self.vocab_size
self.vocab_size += 1
j += 1
return filtered_corpus
def __create_context_and_center_words(self,processed_corpus):
for sentence in processed_corpus:
for i,word in enumerate(sentence):
center_word = np.zeros((self.vocab_size,1))
center_word[self.vocab[word]][0] = 1
context = np.zeros((self.vocab_size,1))
for j in range(i-self.window_size,i + self.window_size + 1):
if j != i and j >= 0 and j < len(sentence):
context[self.vocab[sentence[j]]][0] += 1
self.X.append(center_word)
self.Y.append(context)
self.X = np.array(self.X)
self.Y = np.array(self.Y)
def initialize(self,corpus):
corpus = self.__create_vocabulary(corpus)
self.__create_context_and_center_words(corpus)
self.W1 = np.random.rand(self.vocab_size,self.N)
self.W2 = np.random.rand(self.N,self.vocab_size)
def feed_forward(self,x):
h = np.dot(self.W1.T,x) # N V . V 1 -> N 1
u = np.dot(self.W2.T,h) # V N . N 1 -> V 1
y = softmax(u)
return h,u,y
def backpropagate(self,x,y_actual,y_result,h):
e = y_result - y_actual # V 1
dw2 = np.dot(h,e.T) # N 1 . 1 V -> N V
eh = np.dot(self.W2,e) # N x V . V x 1 -> N x 1
dw1 = np.dot(x,eh.T) # V x 1 . 1 x N -> V x N
return dw1,dw2
def train(self,epochs):
for i in range(epochs):
loss = 0
dw1,dw2 = np.zeros_like(self.W1),np.zeros_like(self.W2)
for j in range(len(self.X)):
h,_,y = self.feed_forward(self.X[j])
a,b = self.backpropagate(self.X[j],self.Y[j],y,h)
dw1 += a
dw2 += b
loss -= np.sum(self.Y[j] * np.log(y+1e-08))
loss /= len(self.X)
[dw1,dw2] = [dw1/len(self.X), dw2/len(self.X)]
self.W1 -= self.alpha * dw1
self.W2 -= self.alpha * dw2
print(f'Epoch : {i+1}, Loss = {loss}')
def get_similar_words(self,word,n):
if word in self.vocab:
x = np.zeros((self.vocab_size,))
x[self.vocab[word]] = 1
_,_,y = self.feed_forward(x)
output = {}
for i in range(self.vocab_size):
output[y[i]] = i
words = {i:word for i,word in enumerate(self.vocab.keys())}
context = []
for k in sorted(output,reverse=True):
context.append(words[output[k]])
if len(context) == n:
break
return context
else:
print("Given Word not found")
def get_vector(self,word):
return self.W1[self.vocab[word]]
def plot(self):
tsne = TSNE(n_components=3,random_state=0,perplexity=self.vocab_size-1)
vectors_3d = tsne.fit_transform(self.W1)
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111,projection='3d')
ax.scatter(vectors_3d[:,0],vectors_3d[:,1],vectors_3d[:,2],marker='o',edgecolors='k')
for word,i in self.vocab.items():
ax.text(vectors_3d[i,0],vectors_3d[i,1],vectors_3d[i,2],word)
ax.set_title('Word2Vec Word Embeddings')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_zlabel('Dimension 3')
plt.show()
#main.py
from nltk.corpus import gutenberg
corpus = gutenberg.sents()[:40]
w2v = SkipGram(3,20)
w2v.initialize(corpus)
w2v.train(200)
w2v.plot()
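After training, this is roughly how I inspect the result; the word used is just whichever happens to come first in the vocabulary, so it is only illustrative:

# illustrative: look at the nearest neighbours of an arbitrary vocabulary word
some_word = next(iter(w2v.vocab))
print(some_word, '->', w2v.get_similar_words(some_word, 5))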
I have tried adjusting the learning rate and initializing weights with different values, but the issue persists.
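For reference, these are roughly the kinds of changes I experimented with inside __init__ and initialize (the exact values here are only illustrative, not the ones in the listing above):

# illustrative variations I tried (values are examples only)
self.alpha = 0.01                                                   # smaller learning rate
self.W1 = np.random.uniform(-0.8, 0.8, (self.vocab_size, self.N))   # different initial weight range
self.W2 = np.random.uniform(-0.8, 0.8, (self.N, self.vocab_size))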
What I have already checked:

- The code that builds the vocabulary and the context/center word pairs.
- The weight initialization and the learning-rate settings.

What might be going wrong with my implementation?