import math
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
import sentencepiece as spm
# Define PositionalEncoding, MultiHeadAttention, PositionwiseFeedforward,
# TransformerEncoderLayer, TransformerDecoderLayer, and Transformer classes
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        seq_len = query.size(1)
        query = self.query(query).view(batch_size, seq_len, self.num_heads, self.depth).transpose(1, 2)
        key = self.key(key).view(batch_size, seq_len, self.num_heads, self.depth).transpose(1, 2)
        value = self.value(value).view(batch_size, seq_len, self.num_heads, self.depth).transpose(1, 2)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.depth)
        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(1)  # Ensure mask is broadcastable
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        context = torch.matmul(attention_weights, value)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return context, attention_weights
class PositionwiseFeedforward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedforward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x
class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = PositionwiseFeedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, src_mask=None):
        src2, _ = self.self_attn(src, src, src, src_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.ffn(src)
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src
class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerDecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.src_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = PositionwiseFeedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        tgt2, self_attention_weights = self.self_attn(tgt, tgt, tgt, tgt_mask)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        tgt2, src_attention_weights = self.src_attn(tgt, memory, memory, memory_mask)
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.ffn(tgt)
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt, self_attention_weights, src_attention_weights
class Transformer(nn.Module):
    def __init__(self, num_encoder_layers, num_decoder_layers, vocab_size, d_model, num_heads, d_ff, dropout=0.1):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.encoder_layers = nn.ModuleList([TransformerEncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_encoder_layers)])
        self.decoder_layers = nn.ModuleList([TransformerDecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_decoder_layers)])
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        tgt = self.pos_encoder(tgt)
        for layer in self.encoder_layers:
            src = layer(src, src_mask)
        memory = src
        for layer in self.decoder_layers:
            tgt, _, _ = layer(tgt, memory, tgt_mask, src_mask)
        output = self.fc_out(tgt)
        return output
class ConversationDataset(Dataset):
    def __init__(self, conversations, sp_model):
        self.conversations = conversations
        self.sp_model = sp_model

    def __len__(self):
        return len(self.conversations)

    def __getitem__(self, idx):
        input_text, target_text = self.conversations[idx]
        input_tensor = self.sp_model.encode(input_text, out_type=int)
        target_tensor = self.sp_model.encode(target_text, out_type=int)
        return input_tensor, target_tensor

def pad_sequence(seq, max_len, pad_value):
    return seq + [pad_value] * (max_len - len(seq))

def collate_fn(batch, pad_token=0):
    input_seqs, target_seqs = zip(*batch)
    max_input_len = max(len(seq) for seq in input_seqs)
    max_target_len = max(len(seq) for seq in target_seqs)
    input_seqs = [pad_sequence(seq, max_input_len, pad_token) for seq in input_seqs]
    target_seqs = [pad_sequence(seq, max_target_len, pad_token) for seq in target_seqs]
    input_seqs = torch.tensor(input_seqs, dtype=torch.long)
    target_seqs = torch.tensor(target_seqs, dtype=torch.long)
    return input_seqs, target_seqs
def train_model(model, dataloader, num_epochs, learning_rate, vocab_size):
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2, verbose=True)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        pbar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")
        for src, tgt in pbar:
            optimizer.zero_grad()
            src = src.transpose(0, 1)
            tgt_input = tgt[:, :-1].transpose(0, 1)
            tgt_output = tgt[:, 1:].transpose(0, 1)
            src_mask = generate_square_subsequent_mask(src.size(0)).to(src.device)
            tgt_mask = generate_square_subsequent_mask(tgt_input.size(0)).to(tgt_input.device)
            src_mask = src_mask.unsqueeze(0)
            tgt_mask = tgt_mask.unsqueeze(0)
            output = model(src, tgt_input, src_mask=src_mask, tgt_mask=tgt_mask)
            output = output.transpose(0, 1).contiguous().view(-1, vocab_size)
            tgt_output = tgt_output.contiguous().view(-1)
            loss = criterion(output, tgt_output)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            pbar.set_postfix({'loss': total_loss / len(pbar)})
        scheduler.step(total_loss / len(dataloader))

def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask
def generate_response(model, sp_model, input_text, max_len=20):
    model.eval()
    with torch.no_grad():
        src_tokens = sp_model.encode(input_text, out_type=int)
        src_tensor = torch.tensor([src_tokens], dtype=torch.long)
        src_mask = (src_tensor != 0).unsqueeze(-2)
        memory = model.encoder(src_tensor, src_mask)
        # Start decoding with "<s>" token
        tgt_token = [sp_model.bos_id()]
        for i in range(max_len):
            tgt_tensor = torch.tensor([tgt_token], dtype=torch.long)
            tgt_mask = (tgt_tensor != 0).unsqueeze(-2)
            output = model.decoder(tgt_tensor, memory, tgt_mask, src_mask)
            output = torch.argmax(output, dim=-1)
            token = output[0, -1].item()
            if token == sp_model.eos_id():
                break
            tgt_token.append(token)
        output_text = sp_model.decode_ids(tgt_token)
        return output_text
# Load SentencePiece model
sp_model = spm.SentencePieceProcessor(model_file='m.model')

# Example conversation dataset
conversations = [
    ("hello how are you", "i am fine"),
    ("what is your name", "my name is bot"),
    ("how old are you", "i am 2 years old")
]

# Create dataset and dataloader
dataset = ConversationDataset(conversations, sp_model)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Model parameters
num_encoder_layers = 6
num_decoder_layers = 6
vocab_size = sp_model.get_piece_size()  # Get actual vocab size from SentencePiece model
d_model = 512
num_heads = 8
d_ff = 2048
dropout = 0.1
learning_rate = 0.0001
num_epochs = 10

# Initialize and train the model
model = Transformer(num_encoder_layers, num_decoder_layers, vocab_size, d_model, num_heads, d_ff, dropout)
train_model(model, dataloader, num_epochs, learning_rate, vocab_size)

# Test inference
input_text = "hello how are you"
response = generate_response(model, sp_model, input_text)
print(f"Input: {input_text}\nResponse: {response}")
>>> %Run ai.py
/home/shaykhul/.local/lib/python3.11/site-packages/torch/optim/lr_scheduler.py:28: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate.
warnings.warn("The verbose parameter is deprecated. Please use get_last_lr() "
Epoch 1/10: 0%| | 0/2 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/home/shaykhul/Desktop/ai.py", line 272, in <module>
    train_model(model, dataloader, num_epochs, learning_rate, vocab_size)
  File "/home/shaykhul/Desktop/ai.py", line 196, in train_model
    output = model(src, tgt_input, src_mask=src_mask, tgt_mask=tgt_mask)
  File "/home/shaykhul/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/shaykhul/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/shaykhul/Desktop/ai.py", line 137, in forward
    src = layer(src, src_mask)
  File "/home/shaykhul/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/shaykhul/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/shaykhul/Desktop/ai.py", line 87, in forward
    src2, _ = self.self_attn(src, src, src, src_mask)
  File "/home/shaykhul/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/shaykhul/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/shaykhul/Desktop/ai.py", line 55, in forward
    scores = scores.masked_fill(mask == 0, float('-inf'))
RuntimeError: The size of tensor a (13) must match the size of tensor b (2) at non-singleton dimension 4
I can’t fix this error. I have tried different approaches, and even ChatGPT, but nothing has worked. The error seems to come from mismatched tensor dimensions in MultiHeadAttention. I want to build a simple personal chatbot (an AI girlfriend) with my own custom transformer model, trained on my own custom datasets, so please help me fix this error.
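For reference, here is a minimal sketch of one way the attention shapes could be made consistent, assuming the model is kept batch-first (batch, seq_len, d_model) end to end. The make_pad_mask and make_causal_mask helpers are illustrative names, not part of the code above:

def make_pad_mask(seq, pad_token=0):
    # (batch, seq_len) -> (batch, 1, 1, seq_len); True where the token is real
    return (seq != pad_token).unsqueeze(1).unsqueeze(2)

def make_causal_mask(size):
    # (1, 1, size, size); True where attention is allowed (lower triangle)
    return torch.tril(torch.ones(size, size, dtype=torch.bool)).unsqueeze(0).unsqueeze(0)

# In MultiHeadAttention.forward, drop the two extra unsqueezes and expect a
# boolean mask that already broadcasts against scores of shape
# (batch, num_heads, q_len, k_len):
#     if mask is not None:
#         scores = scores.masked_fill(~mask, float('-inf'))
# Also reshape key and value with their own sequence length (key.size(1)),
# not the query's, so cross-attention works when source and target lengths differ.

# In train_model, keep src and tgt as (batch, seq_len) instead of transposing,
# and build the masks like this:
#     src_mask = make_pad_mask(src)
#     tgt_mask = make_causal_mask(tgt_input.size(1)) & make_pad_mask(tgt_input)
#     output = model(src, tgt_input, src_mask=src_mask, tgt_mask=tgt_mask)
# The output.transpose(0, 1) before the loss is then no longer needed.

# PositionalEncoding would also need to index positions along dim 1 for
# batch-first input, e.g. x = x + self.pe[:x.size(1)].transpose(0, 1)

Using boolean masks that broadcast to (batch, num_heads, q_len, k_len) avoids mixing a float -inf mask (from generate_square_subsequent_mask) with the mask == 0 comparison inside the attention.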
This warning has also been discussed at https://discuss.pytorch.org/t/torch-optim-lr-scheduler-py-28-userwarning-the-verbose-parameter-is-deprecated/196634, where the advice given is:

Don’t pass the verbose argument to the scheduler as explained in the warning.

and

.get_last_lr() is the new and recommended way to check the current learning rate.

In other words, do what the warning you received suggests. The problematic line in your code is:

scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2, verbose=True)
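For example, a minimal sketch of the change inside train_model, dropping verbose and printing the learning rate yourself (current_lr is just an illustrative name):

scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2)  # no verbose argument

for epoch in range(num_epochs):
    # ... training loop as before ...
    scheduler.step(total_loss / len(dataloader))
    # Report the current learning rate explicitly instead of relying on verbose.
    current_lr = optimizer.param_groups[0]['lr']  # works on any PyTorch version
    print(f"Epoch {epoch + 1}: lr = {current_lr}")
    # On versions where ReduceLROnPlateau exposes get_last_lr(), as the warning
    # suggests, scheduler.get_last_lr() returns the same information.

Note that this only removes the deprecation warning; the RuntimeError from the mask shapes is a separate issue.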