I want to train an LSTM model for next-word prediction, once with character-based tokens and once with word-based tokens. I used a similar data-processing technique for both.
For the character-based model:
    import torch
    from torch.utils.data import Dataset, DataLoader

    class TextDataset(Dataset):
        def __init__(self, text, sequence_length):
            # `text` is a tokenized dataset split with a 'tokens' column.
            self.text = text
            self.sequence_length = sequence_length
            self.text_length = len(text) - sequence_length

        def __len__(self):
            return self.text_length

        def __getitem__(self, idx):
            # Input: a window of `sequence_length` tokens; target: the token that follows it.
            seq = self.text[idx: idx + self.sequence_length]
            next_char = self.text[idx + self.sequence_length]
            return torch.tensor(seq['tokens'], dtype=torch.long), torch.tensor(next_char['tokens'], dtype=torch.long)
sequence_length = 20
train_data = TextDataset(tokenized_dataset['train'], sequence_length)
valid_data = TextDataset(tokenized_dataset['valid'], sequence_length)
test_data = TextDataset(tokenized_dataset['test'], sequence_length)
train_dataloader = DataLoader(train_data, batch_size=1024, shuffle=False)
valid_dataloader = DataLoader(valid_data, batch_size=1024, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=1024, shuffle=False)
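As a sanity check on the pipeline, I look at one batch from the training loader (this is only a check sketch; the exact shapes depend on how the 'tokens' column is stored):

    # One batch should give inputs of shape (batch_size, sequence_length)
    # and targets of shape (batch_size,), matching CrossEntropyLoss below.
    inputs, targets = next(iter(train_dataloader))
    print(inputs.shape)    # expecting torch.Size([1024, 20])
    print(targets.shape)   # expecting torch.Size([1024])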
For the word-based model, I used words as tokens instead of characters, with the same Dataset class and DataLoader setup.
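For reference, the two tokenizations look roughly like this (a simplified sketch, not my actual pipeline, which stores the ids in the 'tokens' column of a tokenized dataset):

    text = "the quick brown fox jumps over the lazy dog"

    # Character-level: the vocabulary is the set of distinct characters (usually well under 100).
    char_vocab = {ch: i for i, ch in enumerate(sorted(set(text)))}
    char_ids = [char_vocab[ch] for ch in text]

    # Word-level: the vocabulary is the set of distinct words (tens of thousands on a real corpus).
    word_vocab = {w: i for i, w in enumerate(sorted(set(text.split())))}
    word_ids = [word_vocab[w] for w in text.split()]

    print(len(char_ids), len(word_ids))   # 43 vs. 9: the character sequence is far longer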
Here is the LSTM model:
    import torch.nn as nn

    class LSTMModel(nn.Module):
        def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, dropout_rate):
            super(LSTMModel, self).__init__()
            self.num_layers = n_layers
            self.hidden_dim = hidden_dim
            self.embedding_dim = embedding_dim
            self.dropout = nn.Dropout(dropout_rate)
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                                dropout=dropout_rate, batch_first=True)
            self.fc = nn.Linear(hidden_dim, vocab_size)

        def forward(self, x, hidden):
            x = self.embedding(x)
            out, hidden = self.lstm(x, hidden)
            out = self.dropout(out)
            # Only the last time step is used to predict the next token.
            out = self.fc(out[:, -1])
            return out, hidden

        def init_hidden(self, batch_size, device):
            hidden = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
            cell = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
            return hidden, cell

        def detach_hidden(self, hidden):
            hidden, cell = hidden
            return hidden.detach(), cell.detach()
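To verify the output shapes, I push a dummy batch through the model (the hyperparameter values below are placeholders for illustration, not the ones I actually trained with):

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    vocab_size = 100   # placeholder: a few dozen for characters, tens of thousands for words

    model = LSTMModel(vocab_size, embedding_dim=128, hidden_dim=256,
                      n_layers=2, dropout_rate=0.2).to(device)

    dummy = torch.randint(0, vocab_size, (1024, 20), device=device)   # (batch_size, sequence_length)
    hidden = model.init_hidden(1024, device)
    logits, hidden = model(dummy, hidden)
    print(logits.shape)   # torch.Size([1024, 100]) -> one score per vocabulary entry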
And the training step:
    from tqdm import tqdm

    for epoch in range(n_epochs):
        for inputs, targets in tqdm(train_dataloader):
            # A fresh (zeroed) hidden state for every batch.
            hidden = model.init_hidden(inputs.shape[0], device)
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            output, hidden = model(inputs, hidden)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
            # detach returns new tensors, so keep the result (redundant here since
            # hidden is re-initialized each batch, but harmless).
            hidden = model.detach_hidden(hidden)
        print(f"Epoch {epoch+1}/{n_epochs}, Loss: {loss.item()}")
Here, the character-based training takes much longer per epoch than the word-based training. Why?
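For context, these are the quantities I would compare between the two runs (a sketch using the objects defined above; `vocab_size` stands for whatever value was passed to the model in each run):

    print("training samples per epoch:", len(train_data))        # roughly the number of tokens in the train split
    print("batches per epoch         :", len(train_dataloader))  # samples / 1024
    print("vocabulary size           :", vocab_size)             # characters vs. words differ a lot here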