I’m implementing a skip-gram Word2Vec model in Python. However, the model doesn’t seem to be working correctly, judging by the resulting embeddings and their visualization: the 3D plot of the embeddings shows all the words clustered together and overlapping, making it difficult to distinguish between them.
I suspect that the issue lies in my implementation rather than the plotting function.
import numpy as np
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import re
np.random.seed(10)
def softmax(x):
    '''
    Numerically stable softmax:

        softmax(x_i) = exp(x_i - max(x)) / sum_j exp(x_j - max(x))
    '''
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum()
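# quick illustrative self-check (arbitrary values): a softmax output should sum to 1
assert np.isclose(softmax(np.array([1.0, 2.0, 3.0])).sum(), 1.0)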
class SkipGram:
def __init__(self,ws=2,dim=8) -> None:
self.X = []
self.N = dim
self.Y = []
self.window_size = ws
self.alpha = 0.1
self.vocab = {}
self.vocab_size = 0
def __create_vocabulary(self,corpus):
stop_words = set(stopwords.words("english"))
filtered_corpus = []
self.vocab_size = 0
for i,sentence in enumerate(corpus):
if isinstance(sentence,str) :
corpus[i] = sentence.split()
filtered_corpus.append([])
j = 0
for word in corpus[i]:
w = re.sub(r'[^a-z]+','',word.lower())
if w != '' and w not in stop_words:
corpus[i][j] = w
filtered_corpus[i].append(w)
else:
continue
if corpus[i][j].lower() not in self.vocab:
self.vocab[corpus[i][j].lower()] = self.vocab_size
self.vocab_size += 1
j += 1
return filtered_corpus
def __create_context_and_center_words(self,processed_corpus):
for sentence in processed_corpus:
for i,word in enumerate(sentence):
center_word = np.zeros((self.vocab_size,1))
center_word[self.vocab[word]][0] = 1
context = np.zeros((self.vocab_size,1))
for j in range(i-self.window_size,i + self.window_size + 1):
if j != i and j >= 0 and j < len(sentence):
context[self.vocab[sentence[j]]][0] += 1
self.X.append(center_word)
self.Y.append(context)
self.X = np.array(self.X)
self.Y = np.array(self.Y)
def initialize(self,corpus):
corpus = self.__create_vocabulary(corpus)
self.__create_context_and_center_words(corpus)
self.W1 = np.random.rand(self.vocab_size,self.N)
self.W2 = np.random.rand(self.N,self.vocab_size)
def feed_forward(self,x):
h = np.dot(self.W1.T,x) # N V . V 1 -> N 1
u = np.dot(self.W2.T,h) # V N . N 1 -> V 1
y = softmax(u)
return h,u,y
def backpropagate(self,x,y_actual,y_result,h):
e = y_result - y_actual # V 1
dw2 = np.dot(h,e.T) # N 1 . 1 V -> N V
eh = np.dot(self.W2,e) # N x V . V x 1 -> N x 1
dw1 = np.dot(x,eh.T) # V x 1 . 1 x N -> V x N
return dw1,dw2
def train(self,epochs):
for i in range(epochs):
loss = 0
dw1,dw2 = np.zeros_like(self.W1),np.zeros_like(self.W2)
for j in range(len(self.X)):
h,_,y = self.feed_forward(self.X[j])
a,b = self.backpropagate(self.X[j],self.Y[j],y,h)
dw1 += a
dw2 += b
loss -= np.sum(self.Y[j] * np.log(y+1e-08))
loss /= len(self.X)
[dw1,dw2] = [dw1/len(self.X), dw2/len(self.X)]
self.W1 -= self.alpha * dw1
self.W2 -= self.alpha * dw2
print(f'Epoch : {i+1}, Loss = {loss}')
def get_similar_words(self,word,n):
if word in self.vocab:
x = np.zeros((self.vocab_size,))
x[self.vocab[word]] = 1
_,_,y = self.feed_forward(x)
output = {}
for i in range(self.vocab_size):
output[y[i]] = i
words = {i:word for i,word in enumerate(self.vocab.keys())}
context = []
for k in sorted(output,reverse=True):
context.append(words[output[k]])
if len(context) == n:
break
return context
else:
print("Given Word not found")
def get_vector(self,word):
return self.W1[self.vocab[word]]
def plot(self):
tsne = TSNE(n_components=3,random_state=0,perplexity=self.vocab_size-1)
vectors_3d = tsne.fit_transform(self.W1)
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111,projection='3d')
ax.scatter(vectors_3d[:,0],vectors_3d[:,1],vectors_3d[:,2],marker='o',edgecolors='k')
for word,i in self.vocab.items():
ax.text(vectors_3d[i,0],vectors_3d[i,1],vectors_3d[i,2],word)
ax.set_title('Word2Vec Word Embeddings')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_zlabel('Dimension 3')
plt.show()
#main.py
from nltk.corpus import gutenberg
corpus = gutenberg.sents()[:40]
w2v = SkipGram(3,20)
w2v.initialize(corpus)
w2v.train(200)
w2v.plot()
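After training, this is roughly how I inspect the result; the word used is just whichever happens to come first in the vocabulary, so it is only illustrative:

# illustrative: look at the nearest neighbours of an arbitrary vocabulary word
some_word = next(iter(w2v.vocab))
print(some_word, '->', w2v.get_similar_words(some_word, 5))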
I have tried adjusting the learning rate and initializing weights with different values, but the issue persists.
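For reference, these are roughly the kinds of changes I experimented with inside __init__ and initialize (the exact values here are only illustrative, not the ones in the listing above):

# illustrative variations I tried (values are examples only)
self.alpha = 0.01                                                   # smaller learning rate
self.W1 = np.random.uniform(-0.8, 0.8, (self.vocab_size, self.N))   # different initial weight range
self.W2 = np.random.uniform(-0.8, 0.8, (self.N, self.vocab_size))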
What I have already checked:

- The code that builds the vocabulary and the context/center word pairs.
- The weight initialization and the learning-rate settings.

What might be going wrong with my implementation?