For context, I’m using this Image Captioning GitHub tutorial, and I’m now trying to fine-tune it on my own dataset.
I have just one image and one caption, and I’m trying to train the model on that pair. The first problem I encountered was with BatchNorm1d, so I changed it to InstanceNorm1d.
Now the error is the one in the title, and this is the code.
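For reference, here is a minimal sketch (my own repro, separate from the tutorial) of why BatchNorm1d broke for me: in training mode it needs more than one value per channel to compute batch statistics, and my batch size is 1.

import torch
import torch.nn as nn

bn = nn.BatchNorm1d(256)
bn.train()
single = torch.randn(1, 256)   # batch of one, like my dataloader produces
# bn(single)                   # raises "Expected more than 1 value per channel when training"
batch = torch.randn(4, 256)    # with a bigger batch the same layer is fine
print(bn(batch).shape)         # torch.Size([4, 256])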
The model wrapper (just in case anyone finds the problem here):
class FineTuneModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(FineTuneModel, self).__init__()
        self.encoder = EncoderCNN(embed_size)
        self.decoder = DecoderRNN(embed_size,
                                  hidden_size,
                                  vocab_size,
                                  num_layers,
                                  max_seq_length=20)

    def forward(self, images, captions, lengths):
        features = self.encoder(images)
        outputs = self.decoder(features, captions, lengths)
        return outputs
The encoder:
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]  # drop the final fc layer
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        # self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
        self.bn = nn.InstanceNorm1d(256, track_running_stats=True)

    def forward(self, images):
        with torch.no_grad():
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)  # flatten to (batch, 2048)
        features = self.bn(self.linear(features))          # (batch, embed_size) goes into the norm
        return features
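For what it’s worth, the PyTorch docs say nn.InstanceNorm1d normalizes over a length dimension and expects (N, C, L) input, while self.linear produces a flat (N, embed_size) tensor. A small shape experiment (my own sketch, not from the tutorial) that shows the mismatch:

import torch
import torch.nn as nn

embed_size = 256
inorm = nn.InstanceNorm1d(embed_size, track_running_stats=True)

seq = torch.randn(1, embed_size, 20)  # (N, C, L): the shape InstanceNorm1d is built for
print(inorm(seq).shape)               # torch.Size([1, 256, 20])

flat = torch.randn(1, embed_size)     # (N, embed_size): what self.linear actually outputs
# inorm(flat)                         # 2D input errors for me instead of normalizing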
And this is the loader:
class CustomDataset(Dataset):
    def __init__(self, image_folder, caption_folder, transform=None):
        self.image_folder = image_folder
        self.caption_folder = caption_folder
        self.transform = transform

    def __len__(self):
        return 256
        # return len(os.listdir(self.image_folder))

    def __getitem__(self, idx):
        # img_name = os.path.join(self.image_folder, str(idx) + '.jpg')
        img_name = os.path.join(self.image_folder, "0" + '.jpg')
        img = Image.open(img_name)
        transform = transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))])
        img = transform(img)

        def tokenize(caption, vocabulary):
            words = caption.split()
            tokens = [vocabulary(word) for word in words]
            return tokens

        tokenized_captions = tokenize(caption, vocab)
        print("Image shape:", img.shape)
        return img, torch.tensor(tokenized_captions)
dataset = CustomDataset(image_folder='images/',
                        caption_folder='captions/',
                        transform=transforms.ToTensor())
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, drop_last=False)
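And for completeness, a minimal sketch (not my exact training loop; vocab and the hyper-parameters are placeholders) of how one batch flows through the model, following the tutorial's train.py pattern:

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

model = FineTuneModel(embed_size=256, hidden_size=512,
                      vocab_size=len(vocab), num_layers=1)
criterion = nn.CrossEntropyLoss()

for images, captions in dataloader:
    lengths = [captions.size(1)]   # batch_size=1, so a single caption length
    targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
    outputs = model(images, captions, lengths)
    loss = criterion(outputs, targets)
    print(loss.item())
    break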
Thanks in advance.