I’m trying to create a custom dataset in PyTorch that is similar to MNIST but uses my own images. However, I’m having difficulty preparing the dataset and, possibly, with the architecture of the seq2seq (encoder-decoder) model.
My current code for creating the custom dataset looks like this:
class custom_dataset(Dataset):
    """Image dataset read from one sub-folder per digit class.

    The files are looked up as ``<root>/<digit>/*.jpg`` for ``digit`` in
    ``0..9``; the folder name is used as the integer label.

    Args:
        root: Directory that contains the digit folders ``0`` .. ``9``.
        transform: Optional callable applied to the opened PIL image. It has
            to return a ``torch.Tensor`` when ``normalize`` is true.
        normalize: When true, the transformed tensor is mapped through
            ``(image - 0.5) / 0.5``.

    Attributes:
        data: List of ``[img_path, label]`` pairs, ordered by label and then
            by file path.
    """

    def __init__(self, root, transform=None, normalize=True):
        self.root = root
        self.transform = transform
        self.normalize = normalize
        self.data = []
        for digit in range(10):  # Go through the folders from 0 to 9
            class_path = os.path.join(self.root, str(digit))
            # glob yields files in arbitrary, filesystem-dependent order;
            # sorting keeps sample indices reproducible between runs.
            img_paths = sorted(glob.glob(os.path.join(class_path, "*.jpg")))
            self.data.extend([img_path, digit] for img_path in img_paths)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at position ``idx``.

        Raises:
            TypeError: If ``normalize`` is true but the image is not a tensor
                (for example because no tensor-producing transform was given).
        """
        img_path, label = self.data[idx]
        # Read the pixel data while the file is open so the handle is
        # released right away instead of lingering until garbage collection.
        with Image.open(img_path) as image:
            image.load()
        if self.transform is not None:
            image = self.transform(image)
        if self.normalize:
            if not isinstance(image, torch.Tensor):
                raise TypeError(
                    "normalize=True requires the transform to return a "
                    f"torch.Tensor, got {type(image).__name__}; add "
                    "transforms.ToTensor() or pass normalize=False"
                )
            image = (image - 0.5) / 0.5
        return image, torch.tensor(label)

    def __len__(self):
        """Return the number of images found under ``root``."""
        return len(self.data)
# Preprocessing shared by the train and test splits.
# Grayscale runs first, on the PIL image, so colour and already-grayscale
# files both end up as a single channel before anything becomes a tensor.
# ToTensor comes last; the dataset's own normalize step is applied to its
# output afterwards.
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((28, 28)),
    transforms.ToTensor(),
])

train_dataset = custom_dataset(root='/home/encoder/Train', transform=transform)
test_dataset = custom_dataset(root='/home/encoder/Test', transform=transform)

# Only the training split is shuffled; the test split keeps a fixed order.
imagenet_data = DataLoader(train_dataset, batch_size=8, shuffle=True)
imagenet_data_test = DataLoader(test_dataset, batch_size=8, shuffle=False)
And here’s an example of my encoder-decoder architecture:
class encoder_decoder(nn.Module):
    """CNN + GRU encoder with a GRU decoder that emits a token sequence.

    The image goes through three convolutions (with two max-pools); the rows
    of the resulting feature map are fed to the encoder GRU as time steps,
    and the encoder's final hidden state initialises the decoder GRU.

    Args:
        output: Number of tokens in the vocabulary (size of the embedding
            table and of the final linear layer).
        input_size: ``(height, width)`` of the images the model will receive.
            It is used to derive the encoder GRU's input size, so images of a
            different width will be rejected by the GRU at run time.
    """

    HIDDEN_SIZE = 32       # units per GRU layer
    EMBEDDING_SIZE = 8     # size of a decoder token embedding
    START_TOKEN = 10       # <start> token index
    END_TOKEN = 11         # <end> token index
    MAX_DECODE_STEPS = 12  # upper bound on tokens generated by predict()

    def __init__(self, output, input_size=(28, 28)):
        super().__init__()
        # total tokens
        self.output = output
        self.maxpool1 = torch.nn.MaxPool2d(2)
        self.maxpool2 = torch.nn.MaxPool2d(2)
        self.cnn1 = torch.nn.Conv2d(1, 8, 3)
        self.cnn2 = torch.nn.Conv2d(8, 8, 3)
        # final conv layer of 4 channels
        self.cnn3 = torch.nn.Conv2d(8, 4, 3)
        # Push a dummy image through the CNN so the encoder GRU's input size
        # always matches what the convolutions actually produce.
        with torch.no_grad():
            probe = self._cnn_features(torch.zeros(1, 1, *input_size))
        _, channels, _, width = probe.shape
        # 2 layer gru with 32 units (input_size derived from the CNN output)
        self.encgru = nn.GRU(
            channels * width, self.HIDDEN_SIZE, 2, batch_first=True, dropout=0.1
        )
        # for inputing one hot encoded digits
        self.emb = nn.Embedding(self.output, self.EMBEDDING_SIZE)
        # 2 layer gru with 32 units
        self.decgru = nn.GRU(
            self.EMBEDDING_SIZE, self.HIDDEN_SIZE, 2, batch_first=True, dropout=0.1
        )
        # timeshared linear layer
        self.Linear = nn.Linear(self.HIDDEN_SIZE, self.output, bias=True)

    def _cnn_features(self, x):
        """Run the convolutional stack and return the 4-D feature map."""
        x = self.maxpool1(nn.functional.relu(self.cnn1(x)))
        x = self.maxpool2(nn.functional.relu(self.cnn2(x)))
        return nn.functional.relu(self.cnn3(x))

    def _encode(self, x):
        """Encode an image batch into the encoder GRU's final hidden state."""
        features = self._cnn_features(x)
        batch, channel, time, emb = features.shape
        # concatenating along the y axis: each feature-map row becomes one
        # time step holding all channels of that row
        sequence = features.permute(0, 2, 1, 3).reshape(batch, time, channel * emb)
        # only hidden state is passed to decoder
        _, hidden = self.encgru(sequence)
        return hidden

    def _decode(self, tokens, hidden):
        """Run the decoder on ``tokens`` and return ``(logits, new_hidden)``.

        ``logits`` is flattened to ``(batch * time, output)``.
        """
        x = nn.functional.relu(self.emb(tokens))
        x, hidden = self.decgru(x, hidden)
        x = nn.functional.relu(x)
        return self.Linear(x.reshape(-1, self.HIDDEN_SIZE)), hidden

    def forward(self, x, val):
        """Compute decoder logits for teacher-forced training.

        Args:
            x: Image batch of shape ``(batch, 1, height, width)``.
            val: Integer token indices of shape ``(batch, time)`` fed to the
                decoder.

        Returns:
            Tensor of shape ``(batch * time, output)`` with unnormalised
            token scores.
        """
        hidden = self._encode(x)
        logits, _ = self._decode(val, hidden)
        return logits

    def predict(self, x):
        """Greedily decode the token sequence for a single image.

        Decoding runs in eval mode without gradients; the module's previous
        training mode is restored afterwards.

        Args:
            x: Image tensor of shape ``(1, 1, height, width)``.

        Returns:
            List of int token indices, beginning with the <start> token and
            ending with the <end> token if one was produced within the step
            limit.

        Raises:
            ValueError: If ``x`` does not hold exactly one image.
        """
        if x.shape[0] != 1:
            raise ValueError(
                f"predict() decodes one image at a time, got batch size {x.shape[0]}"
            )
        was_training = self.training
        self.eval()  # disable GRU dropout while decoding
        try:
            with torch.no_grad():
                hidden = self._encode(x)
                pred = [self.START_TOKEN]
                for _ in range(self.MAX_DECODE_STEPS):
                    # Using the previous index to predict the next token
                    input_token = torch.tensor(
                        [[pred[-1]]], dtype=torch.long, device=x.device
                    )
                    logits, hidden = self._decode(input_token, hidden)
                    index = int(torch.argmax(logits, -1)[0])
                    pred.append(index)
                    # if <end> token then break loop
                    if index == self.END_TOKEN:
                        break
        finally:
            self.train(was_training)
        return pred
# Vocabulary of 13 tokens: the ten digits plus <start>, <end> and ".".
model = encoder_decoder(output=13)
My questions:
- How can I properly prepare a custom dataset so that it is compatible with PyTorch and can be used for training?
- My encoder-decoder architecture may be wrong for this task. Which architecture is better suited to building an autoencoder for an MNIST-like dataset?
I would appreciate any help and recommendations!