Thiết kế website giá rẻ

Question

I want to implement binary text classification with PyTorch and Doc2Vec. I’ve vectorized my data and everything seemed fine so far. Next, I implemented my neural network and tried to train it but it seemed to not learn at all. It classifies everything as class 0 which is the majority class (imbalanced dataset) so the resulting accuracy is high but this is not the behavior I want to achieve.

Here’s my code.

Loading and cleaning the dataset (normal tweets and tweets that include cyberbullying):

<code>from datasets import load_dataset

dataset = load_dataset('poleval2019_cyberbullying', 'task01')

print(dataset)

train_dataset = dataset["train"]

test_dataset = dataset["test"]

df_train = pd.DataFrame(train_dataset)

df_test = pd.DataFrame(test_dataset)

nlp = spacy.load('pl_core_news_sm')

def remove_mentions(text):

text = re.sub(r"(?:@|https?://)S+", "", text)

return text

def replace_polish_chars(text):

replacements = {

'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l',

'ń': 'n', 'ó': 'o', 'ś': 's', 'ź': 'z',

'ż': 'z'

}

for polish_char, latin_char in replacements.items():

text = text.replace(polish_char, latin_char)

return text

def clear(text):

text = re.sub(r'\', '', text)

text = re.sub(r'"', '', text)

text = re.sub(r'#S+', '', text)

text = re.sub(r'[?!]+', '', text)

doc = nlp(text)

clean_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_digit and len(token.text) > 2]

clean_text = ' '.join(clean_tokens)

return clean_text

def clean_tweet(tweet):

tweet = remove_mentions(tweet)

tweet = clear(tweet)

tweet = replace_polish_chars(tweet)

return tweet

df_train['text'] = [clean_tweet(tweet) for tweet in df_train['text']]

df_test['text'] = [clean_tweet(tweet) for tweet in df_test['text']]

X_train = df_train['text'].to_numpy()

X_test = df_test['text'].to_numpy()

y_train = df_train['label'].to_numpy()

y_test = df_test['label'].to_numpy()

</code>

<code>from datasets import load_dataset dataset = load_dataset('poleval2019_cyberbullying', 'task01') print(dataset) train_dataset = dataset["train"] test_dataset = dataset["test"] df_train = pd.DataFrame(train_dataset) df_test = pd.DataFrame(test_dataset) nlp = spacy.load('pl_core_news_sm') def remove_mentions(text): text = re.sub(r"(?:@|https?://)S+", "", text) return text def replace_polish_chars(text): replacements = { 'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l', 'ń': 'n', 'ó': 'o', 'ś': 's', 'ź': 'z', 'ż': 'z' } for polish_char, latin_char in replacements.items(): text = text.replace(polish_char, latin_char) return text def clear(text): text = re.sub(r'\', '', text) text = re.sub(r'"', '', text) text = re.sub(r'#S+', '', text) text = re.sub(r'[?!]+', '', text) doc = nlp(text) clean_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_digit and len(token.text) > 2] clean_text = ' '.join(clean_tokens) return clean_text def clean_tweet(tweet): tweet = remove_mentions(tweet) tweet = clear(tweet) tweet = replace_polish_chars(tweet) return tweet df_train['text'] = [clean_tweet(tweet) for tweet in df_train['text']] df_test['text'] = [clean_tweet(tweet) for tweet in df_test['text']] X_train = df_train['text'].to_numpy() X_test = df_test['text'].to_numpy() y_train = df_train['label'].to_numpy() y_test = df_test['label'].to_numpy() </code>

from datasets import load_dataset

# Load the 'task01' configuration of the PolEval 2019 cyberbullying dataset.
# NOTE(review): `pd`, `spacy` and `re` are used in this script but no import
# for them is visible in this snippet — confirm they are imported elsewhere.
dataset = load_dataset('poleval2019_cyberbullying', 'task01')
print(dataset)

# Pull out the two splits and turn each one into a pandas DataFrame.
train_dataset = dataset["train"]
test_dataset = dataset["test"]
df_train = pd.DataFrame(train_dataset)
df_test = pd.DataFrame(test_dataset)

# spaCy pipeline used by clear() for tokenisation and lemmatisation.
nlp = spacy.load('pl_core_news_sm')

def remove_mentions(text):
    """Strip @mentions and http(s) URLs from `text`.

    A match starts at ``@``, ``http://`` or ``https://`` and runs up to the
    next whitespace character. Surrounding whitespace is left in place.

    Args:
        text: Raw tweet text.

    Returns:
        The text with every mention and URL removed.
    """
    # `\S+` (any run of non-whitespace) is required here; a bare `S+` would
    # only match literal capital "S" characters after the prefix.
    return re.sub(r"(?:@|https?://)\S+", "", text)

def replace_polish_chars(text):
    """Return `text` with lowercase Polish diacritics mapped to ASCII letters.

    Only the nine lowercase letters are handled; uppercase variants and all
    other characters pass through unchanged.
    """
    # One-to-one character table: position i of the first string maps to
    # position i of the second.
    ascii_for = str.maketrans('ąćęłńóśźż', 'acelnoszz')
    return text.translate(ascii_for)


def clear(text):
    """Strip noise from a tweet and return its lemmatised, filtered tokens.

    Steps, in order:
      1. remove backslashes and double quotes,
      2. remove hashtags (``#`` followed by non-whitespace),
      3. remove runs of ``?`` / ``!``,
      4. run the module-level spaCy pipeline ``nlp`` and keep the lowercase
         lemma of every token that is not a stop word, punctuation or a
         digit and whose surface form is longer than two characters.

    Args:
        text: Tweet text (mentions/URLs are expected to be removed already).

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Literal characters are removed with str.replace; the original pattern
    # r'\' was an unterminated string literal and could not even be parsed.
    text = text.replace('\\', '').replace('"', '')

    # `\S+` = the whole hashtag up to the next whitespace (a bare `S+` would
    # only match hashtags made of capital "S").
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'[?!]+', '', text)

    doc = nlp(text)

    clean_tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and len(token.text) > 2
    ]
    return ' '.join(clean_tokens)

def clean_tweet(tweet):
    """Run the full cleaning pipeline on a single tweet.

    Order matters: mentions/URLs are dropped first, then the text is
    de-noised and lemmatised, and only then are Polish diacritics mapped
    to ASCII.
    """
    without_mentions = remove_mentions(tweet)
    lemmatised = clear(without_mentions)
    return replace_polish_chars(lemmatised)

# Clean every tweet in both splits, element by element.
df_train['text'] = df_train['text'].map(clean_tweet)
df_test['text'] = df_test['text'].map(clean_tweet)

# Features (cleaned text) and targets (labels) as NumPy arrays.
X_train, y_train = df_train['text'].to_numpy(), df_train['label'].to_numpy()
X_test, y_test = df_test['text'].to_numpy(), df_test['label'].to_numpy()

Vectorizing the data:

<code>from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Doc2Vec

import gensim

import numpy as np

tokenized_train = [row.split() for row in X_train]

tokenized_test = [row.split() for row in X_test]

tagged_train = []

tagged_test = []

for i, tweet in enumerate(tokenized_train):

tagged_train.append(gensim.models.doc2vec.TaggedDocument(tweet, [i]))

vectorizer = Doc2Vec(tagged_train, vector_size=10000, window=2, min_count=1)

vector = vectorizer.infer_vector(["system", "response"])

X_train = np.array([vectorizer.infer_vector(doc) for doc in tokenized_train])

X_test = np.array([vectorizer.infer_vector(doc) for doc in tokenized_test])

</code>

<code>from sklearn.feature_extraction.text import TfidfVectorizer from gensim.models import Doc2Vec import gensim import numpy as np tokenized_train = [row.split() for row in X_train] tokenized_test = [row.split() for row in X_test] tagged_train = [] tagged_test = [] for i, tweet in enumerate(tokenized_train): tagged_train.append(gensim.models.doc2vec.TaggedDocument(tweet, [i])) vectorizer = Doc2Vec(tagged_train, vector_size=10000, window=2, min_count=1) vector = vectorizer.infer_vector(["system", "response"]) X_train = np.array([vectorizer.infer_vector(doc) for doc in tokenized_train]) X_test = np.array([vectorizer.infer_vector(doc) for doc in tokenized_test]) </code>

from  sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Doc2Vec
import gensim
import numpy as np

# Whitespace-tokenise the cleaned tweets.
tokenized_train = [text.split() for text in X_train]
tokenized_test = [text.split() for text in X_test]

# Each training tweet becomes a TaggedDocument tagged with its row index.
tagged_test = []
tagged_train = [
    gensim.models.doc2vec.TaggedDocument(tokens, [doc_id])
    for doc_id, tokens in enumerate(tokenized_train)
]

# Train Doc2Vec on the training tweets only.
# NOTE(review): vector_size=10000 is about as large as the number of
# training tweets quoted in the question — worth revisiting.
vectorizer = Doc2Vec(tagged_train, vector_size=10000, window=2, min_count=1)
vector = vectorizer.infer_vector(["system", "response"])

# Replace the text arrays with one inferred embedding per tweet.
X_train = np.array([vectorizer.infer_vector(tokens) for tokens in tokenized_train])
X_test = np.array([vectorizer.infer_vector(tokens) for tokens in tokenized_test])

And now defining the neural network:

<code>device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def processDataNN(X_train, y_train, X_test, y_test, batch_size):

min_vals = np.min(X_train, axis=0)

max_vals = np.max(X_train, axis=0)

print(X_train)

# Normalize the data

X_train = (X_train - min_vals) / (max_vals - min_vals)

X_test = (X_test - min_vals) / (max_vals - min_vals)

tensor_X_train = torch.Tensor(X_train)

tensor_y_train = torch.Tensor(y_train)

tensor_X_test = torch.Tensor(X_test)

tensor_y_test = torch.Tensor(y_test)

train_dataset = TensorDataset(tensor_X_train, tensor_y_train)

train_dataloader = DataLoader(train_dataset,batch_size = batch_size, shuffle = True)

test_dataset = TensorDataset(tensor_X_test,tensor_y_test)

test_dataloader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False)

return (train_dataloader, test_dataloader)

class TextClassifierNeuralNetwork(nn.Module):

def __init__(self, vector_len, num_classes=1):

super(TextClassifierNeuralNetwork, self).__init__()

self.layer1 = nn.Sequential(nn.Linear(vector_len, 128),

nn.Dropout(0.5),

nn.ReLU(inplace=True))

self.layer2 = nn.Sequential(nn.Linear(128, 64),

nn.Dropout(0.5),

nn.ReLU(inplace=True))

self.layer3 = nn.Sequential(nn.Linear(64, 1),

nn.Sigmoid())

def forward(self, x):

out = self.layer1(x)

out = self.layer2(out)

out = self.layer3(out)

return out

def classifier(X_train, y_train, X_test, y_test, type):

n_epochs = 10

batch_size = 8

train_loader, test_loader = processDataNN(X_train, y_train, X_test, y_test, batch_size)

number, dim = X_train.shape

model = TextClassifierNeuralNetwork(dim).to(device)

criterion = nn.BCELoss()

optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(n_epochs):

model.train()

loss = 0.0

correct = 0

total = 0

for i_batch, (inputs, labels) in enumerate (train_loader):

inputs, labels = inputs.to(device), labels.unsqueeze(1).to(device)

optimizer.zero_grad()

outputs = model(inputs)

outputs = outputs.to(torch.float)

loss = criterion(outputs, labels)

loss.backward()

optimizer.step()

predicted = (outputs > 0.5).float()

total += labels.size(0)

correct += (predicted == labels).sum().item()

loss += loss.item() * len(inputs)

avg_loss = loss / total

print("| Epoch {:3d} | Loss {} | Accuracy {:.3f}".format(

epoch, avg_loss, 100 * correct / total))

model.eval()

loss = 0.0

correct = 0

total = 0

with torch.no_grad():

for i_batch, (inputs, labels) in enumerate (test_loader):

inputs, labels = inputs.to(device), labels.unsqueeze(1).to(device)

outputs = model(inputs)

loss = criterion(outputs, labels)

predicted = (outputs > 0.5).float()

total += labels.size(0)

correct += (predicted == labels).sum().item()

loss += loss.item() * len(inputs)

avg_loss = loss / total

print("Testing Accuracy {}, Loss {}".format(100*correct/total, avg_loss))

# Final evaluation

model.eval()

loss = 0.0

correct = 0

total = 0

y_pred = []

with torch.no_grad():

for i_batch, (inputs, labels) in enumerate (test_loader):

inputs, labels = inputs.to(device), labels.float().unsqueeze(1).to(device)

outputs = model(inputs)

loss = criterion(outputs, labels)

predicted = (outputs > 0.5).float()

total += labels.size(0)

correct += (predicted == labels).sum().item()

loss += loss.item()

for i in range(len(predicted)):

y_pred.append(predicted[i].cpu().numpy())

print("Testing Accuracy {}, Loss {}".format(100*correct/total, loss))

return y_pred

y_pred = classifier(X_train, y_train, X_test, y_test, 2)

confusion_matrix(y_test, y_pred)

</code>

<code>device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') def processDataNN(X_train, y_train, X_test, y_test, batch_size): min_vals = np.min(X_train, axis=0) max_vals = np.max(X_train, axis=0) print(X_train) # Normalize the data X_train = (X_train - min_vals) / (max_vals - min_vals) X_test = (X_test - min_vals) / (max_vals - min_vals) tensor_X_train = torch.Tensor(X_train) tensor_y_train = torch.Tensor(y_train) tensor_X_test = torch.Tensor(X_test) tensor_y_test = torch.Tensor(y_test) train_dataset = TensorDataset(tensor_X_train, tensor_y_train) train_dataloader = DataLoader(train_dataset,batch_size = batch_size, shuffle = True) test_dataset = TensorDataset(tensor_X_test,tensor_y_test) test_dataloader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False) return (train_dataloader, test_dataloader) class TextClassifierNeuralNetwork(nn.Module): def __init__(self, vector_len, num_classes=1): super(TextClassifierNeuralNetwork, self).__init__() self.layer1 = nn.Sequential(nn.Linear(vector_len, 128), nn.Dropout(0.5), nn.ReLU(inplace=True)) self.layer2 = nn.Sequential(nn.Linear(128, 64), nn.Dropout(0.5), nn.ReLU(inplace=True)) self.layer3 = nn.Sequential(nn.Linear(64, 1), nn.Sigmoid()) def forward(self, x): out = self.layer1(x) out = self.layer2(out) out = self.layer3(out) return out def classifier(X_train, y_train, X_test, y_test, type): n_epochs = 10 batch_size = 8 train_loader, test_loader = processDataNN(X_train, y_train, X_test, y_test, batch_size) number, dim = X_train.shape model = TextClassifierNeuralNetwork(dim).to(device) criterion = nn.BCELoss() optimizer = optim.Adam(model.parameters(), lr=0.001) for epoch in range(n_epochs): model.train() loss = 0.0 correct = 0 total = 0 for i_batch, (inputs, labels) in enumerate (train_loader): inputs, labels = inputs.to(device), labels.unsqueeze(1).to(device) optimizer.zero_grad() outputs = model(inputs) outputs = outputs.to(torch.float) loss = criterion(outputs, labels) loss.backward() 
optimizer.step() predicted = (outputs > 0.5).float() total += labels.size(0) correct += (predicted == labels).sum().item() loss += loss.item() * len(inputs) avg_loss = loss / total print("| Epoch {:3d} | Loss {} | Accuracy {:.3f}".format( epoch, avg_loss, 100 * correct / total)) model.eval() loss = 0.0 correct = 0 total = 0 with torch.no_grad(): for i_batch, (inputs, labels) in enumerate (test_loader): inputs, labels = inputs.to(device), labels.unsqueeze(1).to(device) outputs = model(inputs) loss = criterion(outputs, labels) predicted = (outputs > 0.5).float() total += labels.size(0) correct += (predicted == labels).sum().item() loss += loss.item() * len(inputs) avg_loss = loss / total print("Testing Accuracy {}, Loss {}".format(100*correct/total, avg_loss)) # Final evaluation model.eval() loss = 0.0 correct = 0 total = 0 y_pred = [] with torch.no_grad(): for i_batch, (inputs, labels) in enumerate (test_loader): inputs, labels = inputs.to(device), labels.float().unsqueeze(1).to(device) outputs = model(inputs) loss = criterion(outputs, labels) predicted = (outputs > 0.5).float() total += labels.size(0) correct += (predicted == labels).sum().item() loss += loss.item() for i in range(len(predicted)): y_pred.append(predicted[i].cpu().numpy()) print("Testing Accuracy {}, Loss {}".format(100*correct/total, loss)) return y_pred y_pred = classifier(X_train, y_train, X_test, y_test, 2) confusion_matrix(y_test, y_pred) </code>

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def processDataNN(X_train, y_train, X_test, y_test, batch_size):
    """Min-max scale the features and wrap both splits in DataLoaders.

    The per-feature minimum and maximum are computed on ``X_train`` only and
    applied to both splits, so test values may fall outside [0, 1].

    Args:
        X_train: 2-D array of training features (samples x features).
        y_train: 1-D array of training labels.
        X_test: 2-D array of test features with the same feature count.
        y_test: 1-D array of test labels.
        batch_size: Number of samples per batch for both loaders.

    Returns:
        ``(train_dataloader, test_dataloader)``; the training loader is
        shuffled, the test loader keeps the input order.
    """
    min_vals = np.min(X_train, axis=0)
    max_vals = np.max(X_train, axis=0)
    print(X_train)

    # A constant feature has max == min. Dividing by that zero range would
    # fill the column with NaN/inf and poison every loss downstream, so such
    # features are divided by 1 instead (they end up as all zeros).
    value_range = max_vals - min_vals
    value_range = np.where(value_range == 0, 1, value_range)

    # Normalize the data
    X_train = (X_train - min_vals) / value_range
    X_test = (X_test - min_vals) / value_range

    # Explicit float32 for both features and labels: BCELoss needs float
    # targets of the same dtype as the model output.
    tensor_X_train = torch.as_tensor(X_train, dtype=torch.float32)
    tensor_y_train = torch.as_tensor(np.asarray(y_train), dtype=torch.float32)
    tensor_X_test = torch.as_tensor(X_test, dtype=torch.float32)
    tensor_y_test = torch.as_tensor(np.asarray(y_test), dtype=torch.float32)

    train_dataset = TensorDataset(tensor_X_train, tensor_y_train)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataset = TensorDataset(tensor_X_test, tensor_y_test)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return (train_dataloader, test_dataloader)



class TextClassifierNeuralNetwork(nn.Module):
    """Three-layer feed-forward classifier with sigmoid outputs.

    Architecture: ``vector_len -> 128 -> 64 -> num_classes``. Both hidden
    layers use dropout (p=0.5) followed by ReLU; the output layer applies a
    sigmoid, so every output is an independent probability in [0, 1] and
    pairs with ``nn.BCELoss``.

    Args:
        vector_len: Size of each input feature vector.
        num_classes: Number of output units. The default of 1 gives a
            single-probability binary classifier.
    """

    def __init__(self, vector_len, num_classes=1):
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(vector_len, 128),
            nn.Dropout(0.5),
            nn.ReLU(inplace=True),
        )
        self.layer2 = nn.Sequential(
            nn.Linear(128, 64),
            nn.Dropout(0.5),
            nn.ReLU(inplace=True),
        )
        # Honour `num_classes`; it used to be accepted but ignored because
        # the output width was hard-coded to 1.
        self.layer3 = nn.Sequential(
            nn.Linear(64, num_classes),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Map a ``(batch, vector_len)`` tensor to ``(batch, num_classes)`` probabilities."""
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        return out




def _evaluate(model, loader, criterion):
    """Return ``(mean loss, accuracy in %, predictions)`` of `model` on `loader`.

    Predictions are thresholded at 0.5 and returned as a list with one
    shape-(1,) NumPy array per sample, in loader order.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    predictions = []
    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.float().unsqueeze(1).to(device)
            outputs = model(inputs)
            batch_loss = criterion(outputs, labels)
            predicted = (outputs > 0.5).float()

            batch_len = labels.size(0)
            total += batch_len
            correct += (predicted == labels).sum().item()
            # Weight by batch size so a short final batch does not skew the mean.
            loss_sum += batch_loss.item() * batch_len
            predictions.extend(predicted.cpu().numpy())

    seen = max(total, 1)  # avoid dividing by zero on an empty loader
    return loss_sum / seen, 100 * correct / seen, predictions


def classifier(X_train, y_train, X_test, y_test, type, n_epochs=10,
               batch_size=8, pos_weight=None):
    """Train the binary text classifier and return its test-set predictions.

    After every epoch the training loss/accuracy and the test loss/accuracy
    are printed; a final test evaluation is printed once training ends.

    Args:
        X_train: 2-D array of training feature vectors.
        y_train: 1-D array of 0/1 training labels.
        X_test: 2-D array of test feature vectors.
        y_test: 1-D array of 0/1 test labels.
        type: Unused; kept so existing positional calls keep working.
        n_epochs: Number of passes over the training data.
        batch_size: Mini-batch size for both loaders.
        pos_weight: Optional multiplier applied to the loss of positive
            (label 1) training samples. With a heavily imbalanced dataset a
            value around ``n_negative / n_positive`` keeps the model from
            collapsing onto the majority class. ``None`` leaves the loss
            unweighted.

    Returns:
        List with one shape-(1,) NumPy array (0.0 or 1.0) per test sample,
        in test-set order.
    """
    train_loader, test_loader = processDataNN(X_train, y_train, X_test, y_test, batch_size)
    _, dim = X_train.shape
    model = TextClassifierNeuralNetwork(dim).to(device)

    criterion = nn.BCELoss()
    # Per-sample losses are needed so positives can be re-weighted; without
    # a weight their mean equals the plain BCELoss above.
    train_criterion = nn.BCELoss(reduction='none')
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(n_epochs):
        model.train()
        # Keep the running total in its own variable: accumulating into
        # `loss` was overwritten by the next batch's criterion call, so the
        # reported epoch loss only reflected the last batch.
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.float().unsqueeze(1).to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            per_sample = train_criterion(outputs, labels)
            if pos_weight is not None:
                # Weight is `pos_weight` where label == 1 and 1 elsewhere.
                per_sample = per_sample * (1.0 + (pos_weight - 1.0) * labels)
            loss = per_sample.mean()
            loss.backward()
            optimizer.step()

            predicted = (outputs > 0.5).float()
            batch_len = labels.size(0)
            total += batch_len
            correct += (predicted == labels).sum().item()
            running_loss += loss.item() * batch_len

        seen = max(total, 1)
        print("| Epoch {:3d} | Loss {} | Accuracy {:.3f}".format(
            epoch, running_loss / seen, 100 * correct / seen))

        test_loss, test_accuracy, _ = _evaluate(model, test_loader, criterion)
        print("Testing Accuracy {}, Loss {}".format(test_accuracy, test_loss))

    # Final evaluation
    test_loss, test_accuracy, y_pred = _evaluate(model, test_loader, criterion)
    print("Testing Accuracy {}, Loss {}".format(test_accuracy, test_loss))

    return y_pred


# Train the network and collect its predictions for the test split.
y_pred = classifier(X_train, y_train, X_test, y_test, 2)
# Bare expression: the matrix is only displayed in an interactive session
# or notebook; wrap it in print() when running as a script.
# NOTE(review): no import of `confusion_matrix` is visible in this snippet —
# presumably sklearn.metrics.confusion_matrix; confirm.
confusion_matrix(y_test, y_pred)

This gives output:

<code> Epoch 0 | Loss 4.785542478202842e-05 | Accuracy 91.136

Testing Accuracy 86.6, Loss 0.0056429700925946236

| Epoch 1 | Loss 1.404969225404784e-05 | Accuracy 91.525

Testing Accuracy 86.6, Loss 0.006202427204698324

| Epoch 2 | Loss 1.600237374077551e-05 | Accuracy 91.525

Testing Accuracy 86.6, Loss 0.006220217794179916

| Epoch 3 | Loss 0.00048047976451925933 | Accuracy 91.525

Testing Accuracy 86.6, Loss 0.006028376519680023

| Epoch 4 | Loss 1.1431842722231522e-05 | Accuracy 91.525

Testing Accuracy 86.6, Loss 0.00612490251660347

| Epoch 5 | Loss 2.435541318845935e-05 | Accuracy 91.525

Testing Accuracy 86.6, Loss 0.00621108990162611

| Epoch 6 | Loss 3.797198587562889e-05 | Accuracy 91.525

Testing Accuracy 86.6, Loss 0.006150437518954277

| Epoch 7 | Loss 1.8876136891776696e-05 | Accuracy 91.525

Testing Accuracy 86.6, Loss 0.006087948102504015

| Epoch 8 | Loss 1.8229215129395016e-05 | Accuracy 91.525

Testing Accuracy 86.6, Loss 0.006090217735618353

| Epoch 9 | Loss 2.1310337615432218e-05 | Accuracy 91.525

Testing Accuracy 86.6, Loss 0.006059492938220501

Testing Accuracy 86.6, Loss 1.3465540409088135

array([[866, 0],

[134, 0]])

</code>

| Epoch   0 | Loss 4.785542478202842e-05 | Accuracy 91.136
Testing Accuracy 86.6, Loss 0.0056429700925946236
| Epoch   1 | Loss 1.404969225404784e-05 | Accuracy 91.525
Testing Accuracy 86.6, Loss 0.006202427204698324
| Epoch   2 | Loss 1.600237374077551e-05 | Accuracy 91.525
Testing Accuracy 86.6, Loss 0.006220217794179916
| Epoch   3 | Loss 0.00048047976451925933 | Accuracy 91.525
Testing Accuracy 86.6, Loss 0.006028376519680023
| Epoch   4 | Loss 1.1431842722231522e-05 | Accuracy 91.525
Testing Accuracy 86.6, Loss 0.00612490251660347
| Epoch   5 | Loss 2.435541318845935e-05 | Accuracy 91.525
Testing Accuracy 86.6, Loss 0.00621108990162611
| Epoch   6 | Loss 3.797198587562889e-05 | Accuracy 91.525
Testing Accuracy 86.6, Loss 0.006150437518954277
| Epoch   7 | Loss 1.8876136891776696e-05 | Accuracy 91.525
Testing Accuracy 86.6, Loss 0.006087948102504015
| Epoch   8 | Loss 1.8229215129395016e-05 | Accuracy 91.525
Testing Accuracy 86.6, Loss 0.006090217735618353
| Epoch   9 | Loss 2.1310337615432218e-05 | Accuracy 91.525
Testing Accuracy 86.6, Loss 0.006059492938220501
Testing Accuracy 86.6, Loss 1.3465540409088135
array([[866,   0],
       [134,   0]])

I have tried adding Dropout, lowering the learning rate and just overall debugging the code, but nothing helped and I can’t find what the problem is. Could it be that the imbalance causes this? There are 10056 normal tweets and 985 tweets with cyberbullying which is about 9% of the dataset. If it is the imbalance, how can I fight this?

Thiết kế website giá rẻ

Danh mục

PyTorch model classifies everything as one class