I am creating a CNN model to recognize dogs and cats. I trained it, and when I evaluate its accuracy by hand it reaches 80-85% on unseen data.
But when I try to use torchmetrics.Accuracy to calculate the accuracy, for some reason I get wrong numbers. Let me explain.
The code of the model (I use Python, PyTorch, and Lightning to optimize the model and the code):
import lightning as L
import torch
import torchmetrics
import torchvision
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from torchvision.transforms import ToTensor
from CustomDataset import CustomDataset
class Model(L.LightningModule):
    def __init__(self, batch_size, learning_rate, num_classes):
        super(Model, self).__init__()
        self.save_hyperparameters()
        ## MODEL LAYERS, CRITERION, etc. GO HERE
        self.accuracy = torchmetrics.Accuracy(num_classes=2, average='macro', task='multiclass')
        self.test_transform = transforms.Compose([
            transforms.Resize((200, 200)),  # Resize images to 200x200
            transforms.ToTensor(),  # Convert images to PyTorch tensors
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize with ImageNet mean/std
        ])
        self.transform = transforms.Compose([
            transforms.RandomResizedCrop(200),  # Randomly crop and resize images to 200x200
            transforms.RandomHorizontalFlip(p=0.5),  # Randomly flip images horizontally
            transforms.RandomRotation(15),  # Randomly rotate images by up to 15 degrees
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            transforms.ToTensor(),  # Convert images to PyTorch tensors
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize with ImageNet mean/std
        ])
    def forward(self, image):
        image = F.relu(self.conv1(image))
        image = self.pool(image)
        image = F.relu(self.conv2(image))
        image = self.pool(image)
        image = F.relu(self.conv3(image))
        image = self.pool(image)  # Output is now (128, 25, 25)
        image = torch.flatten(image, 1)  # Flatten the output
        image = F.relu(self.fc1(image))
        image = self.fc2(image)
        return image
    def training_step(self, batch, batch_idx):
        images, labels = batch
        predictions = self(images)  # Forward pass
        loss = self.criterion(predictions, labels)  # Compute the loss
        predicted_classes = torch.argmax(F.softmax(predictions, dim=1), dim=1)
        predictions_softmax = F.softmax(predictions, dim=1)
        acc = self.accuracy(predictions_softmax, labels)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, prog_bar=True)
        return loss  # Return the loss for backpropagation
    def validation_step(self, batch, batch_idx):
        images, labels = batch
        predictions = self(images)
        loss = self.criterion(predictions, labels)
        predicted_classes = torch.argmax(F.softmax(predictions, dim=1), dim=1)
        predictions_softmax = F.softmax(predictions, dim=1)
        acc = self.accuracy(predictions_softmax, labels)
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return loss
    def test_step(self, batch, batch_idx):
        images, labels = batch
        predictions = self(images)  # Forward pass
        loss = self.criterion(predictions, labels)  # Compute the loss
        predicted_classes = torch.argmax(F.softmax(predictions, dim=1), dim=1)
        predictions_softmax = F.softmax(predictions, dim=1)
        acc = self.accuracy(predictions_softmax, labels)
        real_step_acc = (labels == predicted_classes).sum() / self.batch_size
        self.log('test_loss', loss, prog_bar=True)
        self.log('real_test_acc', real_step_acc, prog_bar=True)
        self.log('test_acc', acc, prog_bar=True)
        return loss
    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=self.learning_rate, momentum=0.9)
        return optimizer
    def train_dataloader(self):
        # Set up and return the training DataLoader
        filepath_train = "dataset/test/"
        train_dataset = datasets.ImageFolder(root=filepath_train, transform=self.transform)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=False, num_workers=16)
        return train_loader
    def test_dataloader(self):
        # Set up and return the test DataLoader
        filepath_test = "dataset/test/"
        test_dataset = datasets.ImageFolder(root=filepath_test, transform=self.transform)
        test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=True, num_workers=16)
        return test_loader
    def val_dataloader(self):
        # Set up and return the validation DataLoader
        filepath_val = "dataset/val/"
        val_dataset = datasets.ImageFolder(root=filepath_val, transform=self.test_transform)
        val_loader = DataLoader(val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=16)
        return val_loader
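For completeness, I run training and testing roughly like this (a simplified sketch; the exact hyperparameter values and Trainer arguments are not shown here):
model = Model(batch_size=64, learning_rate=0.001, num_classes=2)
trainer = L.Trainer(max_epochs=10)
trainer.fit(model)   # uses train_dataloader() and val_dataloader() defined on the module
trainer.test(model)  # uses test_dataloader() defined on the module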
Output is like this:
train_acc_epoch 0.7635096907615662
real_test_acc 0.7901701927185059
test_acc 0.39825108647346497
I compute the "real" test accuracy like this:
predictions_softmax = F.softmax(predictions, dim=1)
acc = self.accuracy(predictions_softmax, labels)
real_step_acc = (labels == predicted_classes).sum() / self.batch_size
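In other words, outside of Lightning the same manual check would look roughly like this (a simplified sketch; here I divide by labels.size(0) instead of a fixed batch size, so a short final batch is still counted correctly):
correct = 0
total = 0
model.eval()
with torch.no_grad():
    for images, labels in test_loader:  # test_loader: the same ImageFolder-based DataLoader as above
        predictions = model(images)
        predicted_classes = torch.argmax(predictions, dim=1)  # argmax of logits equals argmax of softmax
        correct += (predicted_classes == labels).sum().item()
        total += labels.size(0)
print('manual accuracy:', correct / total)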
So the problem is:
When I run testing, the test accuracy reported by torchmetrics inside the test_step method is about 40%, but the "real" test accuracy that I compute myself is 80-85%.
What I tried:
When I enable shuffling on the test data (I know it is bad practice, but it was part of the debugging), torchmetrics.Accuracy becomes correct and reports 80-85% accuracy.
So why does shuffling change anything? Please help me; I think it might even be some kind of bug.
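In case it helps with reproducing the behaviour, here is a stripped-down sketch of the metric usage pattern from my test_step, outside Lightning and with random fake logits instead of my dataset (the labels are deliberately sorted by class, the same way ImageFolder serves them when shuffle=False), so the per-batch metric values can be compared with the epoch-level compute():
import torch
import torchmetrics
from torch.nn import functional as F

metric = torchmetrics.Accuracy(num_classes=2, average='macro', task='multiclass')

# Fake "unshuffled" test set: all class-0 samples first, then all class-1 samples
labels = torch.cat([torch.zeros(500, dtype=torch.long), torch.ones(500, dtype=torch.long)])
logits = torch.randn(1000, 2)  # random stand-in for the model's outputs

per_batch_values = []
for start in range(0, 1000, 100):
    probs = F.softmax(logits[start:start + 100], dim=1)
    batch_labels = labels[start:start + 100]
    per_batch_values.append(metric(probs, batch_labels))  # per-batch value, like what self.log(...) records each step

print('mean of per-batch values:', torch.stack(per_batch_values).mean().item())
print('metric.compute() over all batches:', metric.compute().item())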