Thiết kế website giá rẻ

Question

I’m trying to create a custom dataset in PyTorch that is similar to MNIST but with my own images. However, I’m having difficulties in preparing the dataset and possibly in the architecture of the seq2seq(encoder-decoder).

My current code for creating the custom dataset looks like this:

class custom_dataset(Dataset):
    def __init__(self, root, transform=None, normalize=True):
        self.root = root
        self.transform = transform
        self.normalize = normalize
        self.data = []
        
        for digit in range(10): # Go through the folders from 0 to 9
            class_path = os.path.join(self.root, str(digit))
            for img_path in glob.glob(os.path.join(class_path, "*.jpg")):
                self.data.append([img_path, digit])
                                 
    def __getitem__(self, idx):
        img_path, label = self.data[idx]
        image = Image.open(img_path)
        if self.transform:
            image = self.transform(image) 
        if self.normalize:
            image = (image - 0.5) / 0.5 

        return image, torch.tensor(label)

    def __len__(self):
        return len(self.data)



transform = transforms.Compose([
    transforms.Resize((28, 28)), 
    transforms.ToTensor(), 
    transforms.Grayscale(num_output_channels=1)  
])


train_dataset = custom_dataset(root='/home/encoder/Train', transform=transform)
test_dataset = custom_dataset(root='/home/encoder/Test', transform=transform)

imagenet_data  = DataLoader(train_dataset, batch_size=8, shuffle=True)
imagenet_data_test  = DataLoader(test_dataset, batch_size=8, shuffle=False)

And here’s an example of my encoder-decoder architecture:

class encoder_decoder(nn.Module):
  def __init__(self, output):
    super().__init__()
    # total tokens 
    self.output = output
    self.maxpool1 = torch.nn.MaxPool2d(2)
    self.maxpool2 = torch.nn.MaxPool2d(2)
    self.cnn1 = torch.nn.Conv2d(1, 8, 3, )
    #self.bn1 = nn.BatchNorm2d(8)
    self.cnn2 = torch.nn.Conv2d(8, 8, 3)
    #self.bn2 = nn.BatchNorm2d(8)
    # final conv layer of 4 channels
    self.cnn3 = torch.nn.Conv2d(8, 4, 3, )
    #self.bn3 = nn.BatchNorm2d(4)
    # 2 layer gru with 32 units
    self.encgru = nn.GRU(36, 32, 2, batch_first=True, dropout=0.1) # Изменена input_size
    # for inputing one hot encoded digits
    self.emb = nn.Embedding(self.output, 8)
    # 2 layer gru with 32 units
    self.decgru = nn.GRU(8, 32, 2, batch_first=True, dropout=0.1)
    # timeshared linear layer
    self.Linear = nn.Linear(32, self.output, bias=True, )

  def forward(self, x, val):
    x = self.cnn1(x)
    x = nn.functional.relu(x)
    x = self.maxpool1(x)
    x = self.cnn2(x)
    x = nn.functional.relu(x)
    x = self.maxpool2(x)
    x = self.cnn3(x)
    x = nn.functional.relu(x)
    batch, channel, time, emb = x.shape
    print(f"x shape after CNN: {x.shape}")
    # concatenating along the y axis
    # x=x.permute(0,2,1,3).reshape(batch,time,emb*channel)
    # x = x.permute(0, 2, 1, 3).reshape(batch, time, emb * channel // 2)
    x = x.permute(0, 2, 1, 3).reshape(batch, time, channel * emb // 2) #  Divide by 2 to get 32
    print(f"x shape after reshape: {x.shape}")
    # only hidden state is passed to decoder
    _, hidden = self.encgru(x)

    # Checking that val already has the required size
    batch, time = val.shape 
    print(f"val shape: {val.shape}")

    # val = val.unsqueeze(2) 
    # val = val.repeat(1, 1, self.output)

    x = self.emb(val)
    x = nn.functional.relu(x)
    x = x.squeeze(2)
    x, _ = self.decgru(x, hidden)
    x = nn.functional.relu(x)
    x = self.Linear(x.reshape(-1, 32))
    return x

  def predict(self, x):
    t = []
    x = self.cnn1(x)
    x = nn.functional.relu(x)
    x = self.maxpool1(x)
    x = self.cnn2(x)
    x = nn.functional.relu(x)
    x = self.maxpool2(x)
    x = self.cnn3(x)
    x = nn.functional.relu(x)
    batch, channel, time, emb = x.shape
    print(f"x shape after CNN: {x.shape}")
    # x=x.permute(0,2,1,3).reshape(batch,time,emb*channel // 2) 
    x = x.permute(0, 2, 1, 3).reshape(batch, time, emb * channel)
    print(f"x shape after reshape: {x.shape}")
    _, hidden = self.encgru(x)
    # <start> token index
    index = 10
    pred = [index]
    # maximum length is less than 12
    for _ in range(12):
      # Using the previous index to predict the next token
      input_token = torch.tensor([[[pred[-1]]]]).repeat(1, 1, self.output)
      x = self.emb(input_token)
      x = nn.functional.relu(x)
      x = x.squeeze(2)
      x, hidden = self.decgru(x, hidden)
      x = nn.functional.relu(x)
      x = self.Linear(x.reshape(-1, 32))
      index = torch.argmax(x, -1)[0]
      pred.append(index.item())
      # if <end> token then break loop
      if index == 11:
        break
    return pred

# total 13 tokens are used, includes 10 digits + <start> + <end> + "."
model = encoder_decoder(13)

My questions:

How can I properly prepare a custom dataset so that it is compatible with PyTorch and can be used for training?
I may have the wrong encoder-decoder architecture for this task. What architecture is better to use for creating an autoencoder similar to MNIST?

I would appreciate any help and recommendations!

Thiết kế website giá rẻ

Danh mục

How to build a custom dataset simillar MNIST in pytorch