I was trying to train a Barlow Twins model for image classification. However, I ran into a problem once training had finished: the model seems to have collapsed to a constant. It always returns approximately 2046 (only the decimal part varies slightly), no matter how different the two given images are.
The model is trained to drive the cross-correlation matrix of the two embeddings toward the identity matrix.
Is there a way to overcome this problem?
def off_diagonal(x):
    """Return the off-diagonal elements of a square matrix as a 1-D tensor.

    The elements are returned in row-major order with the main diagonal
    skipped.

    Args:
        x: a 2-D square tensor.

    Returns:
        A 1-D tensor holding the ``n * (n - 1)`` off-diagonal elements of ``x``.

    Raises:
        ValueError: if ``x`` is not a 2-D square matrix.
    """
    if x.dim() != 2 or x.shape[0] != x.shape[1]:
        raise ValueError(
            f"off_diagonal expects a square 2-D matrix, got shape {tuple(x.shape)}"
        )
    n = x.shape[0]
    # Dropping the last element and re-viewing the remainder as
    # (n - 1, n + 1) lines every diagonal entry up in column 0; slicing that
    # column away leaves exactly the off-diagonal entries.
    return x.flatten()[:-1].view(n - 1, n + 1)[:, 1:].flatten()
class BarlowTwins(nn.Module):
    """Barlow Twins model: a ResNet-34 backbone followed by an MLP projector.

    ``forward`` embeds two batches of views, builds the cross-correlation
    matrix of their batch-normalised embeddings and returns the Barlow Twins
    loss, which pushes that matrix toward the identity.

    Args:
        lambd: weight of the off-diagonal (redundancy-reduction) loss term.
        batch_size: divisor used to normalise the cross-correlation matrix.
            The matrix is only a true correlation when the batches passed to
            ``forward`` contain exactly this many samples.
    """

    def __init__(self, lambd, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.lambd = lambd

        # Pretrained ResNet-34 with its classification head replaced by an
        # identity, so the backbone returns features instead of logits.
        self.backbone = torchvision.models.resnet34(zero_init_residual=True, weights='DEFAULT')
        self.backbone.fc = nn.Identity()

        # Layer widths of the projector; sizes[0] is the backbone feature width.
        self.sizes = [512, 2048, 2048, 2048]

        # projector: (Linear -> BatchNorm -> ReLU) for every hidden layer,
        # followed by a final bias-free Linear layer.
        layers = []
        for in_dim, out_dim in zip(self.sizes[:-2], self.sizes[1:-1]):
            layers.append(nn.Linear(in_dim, out_dim, bias=False))
            layers.append(nn.BatchNorm1d(out_dim))
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.Linear(self.sizes[-2], self.sizes[-1], bias=False))
        self.projector = nn.Sequential(*layers)

        # normalization layer for the representations z1 and z2
        self.bn = nn.BatchNorm1d(self.sizes[-1], affine=False)

    def forward(self, y1, y2):
        """Compute the Barlow Twins loss for two batches of views.

        Args:
            y1: first batch of inputs for the backbone.
            y2: second batch of inputs for the backbone.

        Returns:
            A tuple ``(loss, val)`` where ``loss`` is
            ``on_diag + lambd * off_diag`` and ``val`` is the trace of the
            cross-correlation matrix (it equals ``sizes[-1]`` when every
            diagonal entry is exactly 1).
        """
        z1 = self.projector(self.backbone(y1))
        z2 = self.projector(self.backbone(y2))

        # empirical cross-correlation matrix, normalised by the batch size
        c = (self.bn(z1).T @ self.bn(z2)) / self.batch_size

        # Out-of-place ops are used on purpose: torch.diagonal returns a view
        # of c, so an in-place add_/pow_ would overwrite the diagonal before
        # the trace below is read.
        diag = torch.diagonal(c)
        val = diag.sum()
        on_diag = (diag - 1).pow(2).sum()
        off_diag = off_diagonal(c).pow(2).sum()

        loss = on_diag + self.lambd * off_diag
        return loss, val
from tqdm import tqdm
def intiate_p(model, epoch_n, loader, print_freq, lr, momentum, weight_decay):
    """Train ``model`` with SGD and return the loss of every optimisation step.

    Args:
        model: module whose call ``model(y1, y2)`` returns a tuple with the
            loss as its first element.
        epoch_n: number of passes over ``loader``.
        loader: sized iterable yielding ``((y1, y2), _)`` batches.
        print_freq: a stats line is appended to ``stats.txt`` every
            ``print_freq`` steps; ``0`` disables the stats lines.
        lr: initial SGD learning rate.
        momentum: SGD momentum.
        weight_decay: SGD weight decay.

    Returns:
        A list with one detached loss tensor per optimisation step.
    """
    import json  # local import: only needed for the stats lines below

    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)

    # The scheduler is stepped once per optimizer step, so its horizon must be
    # the total number of steps; a fixed small horizon would drive the
    # learning rate to zero long before training ends.
    total_steps = max(1, epoch_n * len(loader))
    scheduler = optim.lr_scheduler.PolynomialLR(optimizer, total_iters=total_steps, power=2.0)

    losses = []
    start_time = time.time()
    epoch_tqdm = tqdm(range(epoch_n))

    # The context manager guarantees the stats file is closed even if
    # training raises.
    with open('stats.txt', 'a', buffering=1, encoding='utf-8') as stats_file:
        for epoch in epoch_tqdm:
            for step, ((y1, y2), _) in enumerate(loader, start=epoch * len(loader)):
                optimizer.zero_grad()
                # Call the module itself (not .forward) so registered hooks run.
                loss = model(y1, y2)[0]
                loss.backward()
                optimizer.step()
                scheduler.step()

                # Detach before storing: keeping the attached tensor would
                # retain the whole autograd graph of every step.
                losses.append(loss.detach())
                loss_value = loss.item()
                epoch_tqdm.set_description(f"the Loss is: {loss_value:.4f} ")

                if print_freq and step % print_freq == 0:
                    stats = {
                        'epoch': epoch,
                        'step': step,
                        'lr': optimizer.param_groups[0]['lr'],
                        'loss': loss_value,
                        'time': int(time.time() - start_time),
                    }
                    print(json.dumps(stats), file=stats_file)
    return losses
The problem is that I am not sure whether my approach to evaluating the model was correct: I fed two randomly chosen images as y1 and y2 to the model for 100 iterations, but the result stayed constant.
Side note: I have tried many different values for the training hyperparameters, and the best loss I could reach was about 100.
# Build the model, then train it for 20 epochs on `loader`.
md = BarlowTwins(lambd=0.005, batch_size=64)
t = intiate_p(
    model=md,
    epoch_n=20,
    loader=loader,
    print_freq=10,
    lr=0.4,
    momentum=0.3,
    weight_decay=0.0001,
)
# Observed: the loss starts at around 2000 and converges to about 100.