I’m losing my mind here. I’ve been working on this stupid object detection model for weeks, and now that I’m finally training it, something’s clearly messed up. The training loop is running, epochs are ticking by, but every single loss value is a big fat ZERO. What gives? The code below me is an implementation of the EAST model and the reset 50 to help me out to localize my training data. Supposedly, I indented to be a simple version of it, expecting 4 bndboxes, for ymin, ymin, ymax, ymax
, and the predicted class
, and tweak it so it seems like and works for my old dataset loader that i used for my ssd training practice.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.models as models
import torchvision.transforms as transforms
from typing import Tuple
import logging
from pathlib import Path
# Set up logging configuration
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
handlers=[
logging.FileHandler("model.log", mode="w")
]
)
logger = logging.getLogger(__name__)
class REAST(nn.Module):
def __init__(self, hidden_size: int = 256, input_channels: int = 1, num_classes: int = 35):
super(REAST, self).__init__()
logger.info("Initializing REAST model")
self.num_classes = num_classes # Save num_classes as an instance variable
# Initialize ResNet50 backbone without pretrained weights
self.backbone = models.resnet50(weights=None)
# Modify the first convolution layer to accept custom input channels
self.backbone.conv1 = nn.Conv2d(input_channels, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
self.backbone = nn.Sequential(*list(self.backbone.children())[:-2])
self.merge = nn.Sequential(
nn.Conv2d(2048, hidden_size, 1),
nn.BatchNorm2d(hidden_size),
nn.ReLU(inplace=False)
)
self.det_head = nn.Sequential(
nn.Conv2d(hidden_size, 128, 3, padding=1),
nn.BatchNorm2d(128),
nn.ReLU(inplace=False),
nn.Conv2d(128, 4 + num_classes, 1) # 4 for bbox, num_classes for class scores
)
logger.info("REAST model initialized")
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
features = self.backbone(x)
merged = self.merge(features)
det_output = self.det_head(merged)
batch_size, _, height, width = det_output.shape
det_output = det_output.permute(0, 2, 3, 1).contiguous()
det_output = det_output.view(batch_size, -1, 4 + self.num_classes)
bbox = torch.sigmoid(det_output[..., :4])
class_scores = det_output[..., 4:]
return bbox, class_scores
class REASTLoss(nn.Module):
"""
Bounding Box Regression Loss for the REAST model.
This loss computes the Smooth L1 Loss between the predicted bounding boxes
and the ground truth bounding boxes.
Args:
beta (float): Transition point from L1 to L2 loss. Default is 1.0.
"""
def __init__(self, beta: float = 1.0):
super(REASTLoss, self).__init__()
self.bbox_loss_fn = nn.SmoothL1Loss(beta=beta, reduction='mean') # Regression loss for bounding boxes
self.class_loss_fn = nn.CrossEntropyLoss() # Classification loss for class scores
def forward(
self,
bbox_pred: torch.Tensor,
bbox_target: torch.Tensor,
class_scores_pred: torch.Tensor,
class_target: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Computes the bounding box regression loss and class classification loss.
Args:
bbox_pred (torch.Tensor): Predicted bounding boxes of shape [batch_size, num_predictions, 4].
bbox_target (torch.Tensor): Ground truth bounding boxes of shape [batch_size, num_predictions, 4].
class_scores_pred (torch.Tensor): Predicted class scores of shape [batch_size, num_predictions, num_classes].
class_target (torch.Tensor): Ground truth class labels of shape [batch_size, num_predictions].
Returns:
Tuple[torch.Tensor, torch.Tensor]: The average bounding box regression loss and class classification loss.
"""
# Flatten the tensors to shape [batch_size * num_predictions, 4]
bbox_pred = bbox_pred.view(-1, 4)
bbox_target = bbox_target.view(-1, 4)
# Compute Smooth L1 Loss for bounding boxes
bbox_loss = self.bbox_loss_fn(bbox_pred, bbox_target)
# Flatten class scores and targets for classification loss
class_scores_pred_flat = class_scores_pred.view(-1, class_scores_pred.size(-1))
class_target_flat = class_target.view(-1)
# Compute Cross-Entropy Loss for class scores
class_loss = self.class_loss_fn(class_scores_pred_flat, class_target_flat)
return bbox_loss, class_loss
if __name__ == "__main__":
# Define paths
images_path: Path = Path("/kaggle/input/license-plate-individual-character-recognition/dataset/test/images")
annotations_path: Path = Path("/kaggle/input/license-plate-individual-character-recognition/dataset/test/annotations")
# Define transformations for images
transform: transforms.Compose = transforms.Compose([
transforms.Resize((150, 300)),
transforms.ToTensor()
])
# Initialize dataset and DataLoader
dataset: PairedDataset = PairedDataset(images_path, annotations_path, transform=transform)
dataloader: DataLoader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)
# Initialize model with custom arguments
model = REAST(hidden_size=256, input_channels=1, num_classes=35)
for batch_idx, (images, targets) in enumerate(dataloader):
if images is None:
continue
logger.info(f"Processing batch {batch_idx + 1}/{len(dataloader)}")
bbox_pred, class_pred = model(images)
print(f"Batch {batch_idx + 1}:")
print(f"Bounding box predictions shape: {bbox_pred}")
print(f"Class predictions shape: {class_pred}")
logger.info("Data pass completed")
logger.info("Script execution completed")
Bounding box predictions shape: tensor([[[0.4904, 0.4826, 0.4331, 0.4461],
[0.4737, 0.4718, 0.3873, 0.4984],
[0.5318, 0.4085, 0.4605, 0.4976],
...,
[0.5346, 0.3973, 0.4157, 0.4013],
[0.5404, 0.5716, 0.4592, 0.4304],
[0.4845, 0.4635, 0.4642, 0.3435]],
[[0.5568, 0.5016, 0.4058, 0.4803],
[0.4984, 0.4541, 0.4168, 0.4797],
[0.5031, 0.4488, 0.4707, 0.4315],
...,
[0.4705, 0.4993, 0.4731, 0.5142],
[0.5195, 0.4995, 0.4252, 0.4137],
[0.3814, 0.4758, 0.4396, 0.5100]],
[[0.5826, 0.5382, 0.4726, 0.4778],
[0.5058, 0.3366, 0.4418, 0.4357],
[0.4882, 0.3945, 0.3619, 0.5942],
...,
[0.5228, 0.6060, 0.3055, 0.4768],
[0.4904, 0.4193, 0.3649, 0.5844],
[0.4747, 0.5737, 0.3878, 0.4274]],
...,
[[0.5031, 0.4971, 0.4134, 0.4165],
[0.5909, 0.4793, 0.3946, 0.5088],
[0.5375, 0.4800, 0.2840, 0.4707],
...,
[0.5572, 0.5223, 0.4603, 0.3975],
[0.5731, 0.4606, 0.4361, 0.3216],
[0.5479, 0.5747, 0.4761, 0.4908]],
[[0.5523, 0.4674, 0.3827, 0.4398],
[0.4980, 0.4429, 0.4330, 0.3433],
[0.5834, 0.5447, 0.4296, 0.4391],
...,
[0.6650, 0.4623, 0.3685, 0.5081],
[0.6243, 0.4948, 0.4134, 0.3126],
[0.4504, 0.5594, 0.4091, 0.3786]],
[[0.4753, 0.5499, 0.4310, 0.4275],
[0.5903, 0.4496, 0.3770, 0.5092],
[0.5353, 0.5013, 0.3369, 0.4931],
...,
[0.4129, 0.5560, 0.4468, 0.5077],
[0.4111, 0.4598, 0.5210, 0.3679],
[0.4886, 0.4824, 0.4739, 0.3538]]], grad_fn=<SigmoidBackward0>)
Class predictions shape: tensor([[[-0.0806, -0.2863, -0.0093, ..., 0.1453, 0.1619, -0.2535],
[ 0.3528, -0.0909, 0.1466, ..., 0.3294, 0.2627, 0.1187],
[-0.1872, -0.1111, -0.4086, ..., 0.3502, 0.1019, -0.0336],
...,
[ 0.0920, -0.5760, 0.2177, ..., 0.1404, 0.0144, -0.2411],
[-0.5110, -0.7292, 0.4410, ..., -0.0231, -0.3882, 0.0686],
[ 0.0444, -0.3606, -0.0462, ..., -0.0270, -0.2601, 0.1730]],
[[ 0.3305, -0.1494, 0.0860, ..., 0.3980, 0.3394, 0.1777],
[ 0.2288, 0.2995, -0.1177, ..., 0.4713, -0.1875, 0.3246],
[ 0.0197, -0.0395, -0.2418, ..., 0.3535, 0.1793, 0.1677],
...,
[-0.1047, -0.0050, -0.0938, ..., 0.2939, -0.4493, -0.3131],
[ 0.3255, -0.3903, -0.0056, ..., 0.5064, -0.0092, 0.0727],
[ 0.2674, -0.4787, 0.2875, ..., 0.0360, -0.0594, 0.1581]],
[[-0.0916, -0.4363, 0.1163, ..., 0.0660, -0.1588, 0.4547],
[ 0.1640, -0.0044, -0.4579, ..., 0.3032, 0.1169, -0.0702],
[ 0.4685, 0.0394, -0.4379, ..., 0.4819, -0.1902, -0.0041],
...,
[ 0.3577, -0.3736, -0.0966, ..., 0.8320, -0.4618, 0.1965],
[ 0.1170, -0.1815, 0.2635, ..., 0.5085, -0.2808, -0.4104],
[ 0.3280, -0.5285, 0.1593, ..., 0.2132, 0.0845, 0.0798]],
...,
[[ 0.2072, -0.0595, -0.0251, ..., 0.2802, 0.1443, -0.1914],
[-0.0046, 0.0999, 0.1122, ..., 0.7022, -0.1804, 0.1607],
[ 0.5502, 0.0381, -0.1655, ..., 0.6034, -0.2458, -0.4194],
...,
[ 0.0174, -0.0653, 0.1226, ..., -0.0778, 0.4626, -0.2119],
[-0.0820, -0.0673, 0.2681, ..., 0.1209, 0.1238, -0.0897],
[ 0.2012, -0.4347, 0.0644, ..., -0.0172, 0.2439, 0.0353]],
[[ 0.3607, 0.3201, -0.0406, ..., 0.4080, -0.1780, 0.1962],
[ 0.2908, -0.1889, -0.1842, ..., 0.4090, 0.2111, 0.0780],
[ 0.1236, 0.0850, 0.3388, ..., 0.6867, -0.1840, 0.2038],
...,
[ 0.1687, 0.2836, 0.1261, ..., 0.6187, -0.6404, -0.8350],
[ 0.0223, -0.4810, -0.5796, ..., -0.0293, -0.2226, -0.7337],
[ 0.0224, -0.4832, 0.0267, ..., -0.3066, -0.1527, 0.5090]],
[[ 0.5192, 0.1122, 0.2916, ..., 0.4464, -0.2356, 0.2271],
[ 0.5739, 0.2733, 0.1988, ..., 0.4276, 0.0144, 0.1697],
[ 0.4333, 0.0017, 0.1719, ..., 0.4272, -0.3533, -0.1419],
...,
[-0.4097, -0.8829, -0.0912, ..., 0.1967, -0.6477, -0.1908],
[-0.0665, -0.4754, -0.2442, ..., 0.2008, -0.0781, -0.2615],
[-0.1611, -0.5038, 0.0295, ..., 0.0230, -0.2014, -0.1446]]],
grad_fn=<SliceBackward0>)
Batch 2:
Bounding box predictions shape: tensor([[[0.5421, 0.5015, 0.4441, 0.4151],
[0.5876, 0.4624, 0.4729, 0.5031],
[0.5161, 0.4403, 0.4073, 0.4686],
...,
[0.5953, 0.4148, 0.4838, 0.3872],
[0.4039, 0.6050, 0.4978, 0.3613],
[0.5355, 0.5035, 0.3781, 0.3715]],
[[0.5702, 0.5047, 0.4579, 0.4599],
[0.6097, 0.4309, 0.4818, 0.6837],
[0.4632, 0.2886, 0.3684, 0.7126],
...,
[0.6064, 0.6285, 0.3851, 0.7249],
[0.6236, 0.6220, 0.4145, 0.4994],
[0.4849, 0.5076, 0.3359, 0.3964]],
[[0.5086, 0.4889, 0.4179, 0.3674],
[0.5550, 0.5354, 0.4949, 0.3966],
[0.5124, 0.4879, 0.3551, 0.4912],
...,
[0.5029, 0.4700, 0.5371, 0.3926],
[0.4874, 0.4419, 0.5231, 0.3494],
[0.4294, 0.4799, 0.4070, 0.4272]],
...,
[[0.5829, 0.4544, 0.4945, 0.3441],
[0.5449, 0.4602, 0.3704, 0.5135],
[0.5799, 0.5946, 0.2727, 0.5179],
...,
[0.6364, 0.6852, 0.4637, 0.4856],
[0.5717, 0.5234, 0.4688, 0.3492],
[0.4586, 0.4468, 0.4053, 0.4276]],
[[0.4841, 0.5341, 0.4181, 0.4564],
[0.5324, 0.4979, 0.4530, 0.4497],
[0.4664, 0.3921, 0.4048, 0.4421],
...,
[0.4912, 0.5531, 0.4774, 0.3730],
[0.5389, 0.4623, 0.4526, 0.4192],
[0.4709, 0.4499, 0.4618, 0.4511]],
[[0.5372, 0.5256, 0.3662, 0.4631],
[0.5944, 0.4660, 0.3277, 0.4507],
[0.5204, 0.4204, 0.3980, 0.4292],
...,
[0.5228, 0.3457, 0.4731, 0.4158],
[0.5656, 0.5219, 0.4509, 0.4094],
[0.5407, 0.4224, 0.4495, 0.4577]]], grad_fn=<SigmoidBackward0>)
Class predictions shape: tensor([[[-0.0606, -0.0272, 0.2072, ..., 0.4931, 0.1349, -0.0423],
[ 0.2339, -0.2714, 0.0835, ..., 0.4049, 0.1895, 0.1290],
[ 0.4997, -0.0561, 0.1738, ..., 0.3218, -0.2822, 0.1928],
...,
[ 0.3929, -0.4195, 0.0524, ..., -0.2508, -0.3643, 0.0899],
[ 0.4513, -0.0925, -0.0688, ..., -0.0693, 0.2799, 0.0602],
[ 0.1498, -0.4824, 0.2054, ..., 0.0524, -0.1789, 0.1263]],
[[ 0.2849, 0.3244, 0.1136, ..., 0.6405, -0.7135, -0.3197],
[ 0.3205, 0.4069, 0.5674, ..., 0.6108, -0.2628, -0.2119],
[ 0.7466, 1.1694, 0.0910, ..., 1.5773, 0.9251, -0.5501],
...,
[ 0.1473, 0.0970, 0.9200, ..., 0.3230, 0.1760, -0.3974],
[ 0.2046, -0.2087, -0.1784, ..., -0.4216, -0.5602, 0.0269],
[ 0.1374, -0.7495, -0.0980, ..., 0.2219, -0.0393, 0.2320]],
[[ 0.2072, -0.0215, 0.0271, ..., 0.2173, -0.2548, -0.0451],
[ 0.0558, 0.0119, 0.2038, ..., 0.4068, -0.1411, 0.0795],
[ 0.0042, -0.1433, -0.1203, ..., 0.4249, -0.1613, 0.0539],
...,
[ 0.2854, -0.0949, -0.2011, ..., -0.0769, 0.0523, -0.0056],
[-0.0580, -0.2639, -0.2910, ..., -0.1965, 0.2757, -0.0230],
[-0.1620, -0.5911, -0.0737, ..., 0.1569, 0.1589, 0.1363]],
...,
[[ 0.0945, -0.2104, 0.3257, ..., -0.0567, -0.1479, -0.0985],
[ 0.5794, 0.0770, 0.1201, ..., 0.3927, -0.1623, -0.2980],
[ 0.0314, -0.0561, -0.0037, ..., 0.4540, -0.6291, 0.1191],
...,
[ 0.5032, -0.2737, -0.1194, ..., -0.2429, -0.2223, -0.4911],
[ 0.4736, -0.3140, -0.2390, ..., -0.1732, -0.0337, 0.1639],
[ 0.0755, -0.2624, -0.2944, ..., -0.0393, 0.0832, -0.0697]],
[[ 0.1329, -0.0335, -0.0591, ..., 0.4464, -0.1904, 0.3879],
[-0.1057, -0.3111, 0.3502, ..., 0.3329, -0.0411, -0.1752],
[-0.1966, -0.1304, 0.0329, ..., 0.4044, -0.0724, 0.0322],
...,
[-0.1304, -0.4384, -0.1595, ..., 0.1827, 0.3148, -0.0365],
[ 0.0251, -0.3613, 0.1422, ..., 0.2249, -0.3405, -0.2777],
[ 0.1301, -0.3983, 0.4183, ..., 0.0374, -0.0270, -0.1966]],
[[ 0.2546, -0.2140, 0.0233, ..., 0.3705, -0.0974, -0.0141],
[ 0.3118, 0.0734, 0.2313, ..., 0.8452, -0.0307, 0.1440],
[ 0.0378, 0.2698, -0.1633, ..., 0.2725, -0.0354, -0.1221],
...,
[-0.3367, -0.4188, -0.3620, ..., 0.2767, 0.2166, -0.0323],
[ 0.0450, -0.1998, 0.0226, ..., -0.3796, -0.0701, 0.3741],
[ 0.2630, -0.5219, -0.1730, ..., 0.3045, 0.1246, -0.1886]]],
grad_fn=<SliceBackward0>)
I haven’t normalized the predicted coordinates to pixel coordinates, hence it is from -1 to 1.
Now, like a loser that I am, I won’t demonstrate my total piece of code, because its absolute trash, so many functions, and have dead ends, but the part that definitely know that is not working, is my _run_apoch_async
.
async def _run_epoch_async(self, data_loader: DataLoader, mode: Literal["training", "validation"] = "training") -> float:
# Set the model to training or evaluation mode based on the provided mode
self.module.train() if mode == "training" else self.module.eval()
# Initialize the total loss for the epoch
total_loss: float = 0.0
# Initialize a progress bar with asynchronous capabilities
async with Bar(iterations=len(data_loader), title=mode.capitalize(), steps=20) as bar:
start_time: float = asyncio.get_event_loop().time()
# Iterate over the DataLoader, which provides batches of images and targets
for progress, (images, targets) in enumerate(data_loader, start=1):
# Ensure that the images are in the correct format (torch.Tensor)
if not isinstance(images, torch.Tensor):
logging.warning("Images must be a torch tensor.")
continue
# Move images and targets to the device (e.g., GPU)
images = images.to(self.device)
targets: List[Dict[str, torch.Tensor]] = [{k: v.to(self.device) for k, v in t.items()} for t in
targets]
# Reset the gradients of the optimizer
self.optimizer.zero_grad()
# Enable gradient calculation only if in training mode
with torch.set_grad_enabled(mode == "training"):
# Forward pass through the model
bbox_pred, class_scores = self.module(images)
# Scale bounding boxes to original image size (avoid in-place operations)
bbox_pred = torch.stack([
bbox_pred[:, :, 0] * self.size[1], # x_min scaled by width
bbox_pred[:, :, 1] * self.size[0], # y_min scaled by height
bbox_pred[:, :, 2] * self.size[1], # x_max scaled by width
bbox_pred[:, :, 3] * self.size[0], # y_max scaled by height
], dim=2)
batch_loss = 0
for i, target in enumerate(targets):
target_boxes = target['boxes'].to(self.device)
target_labels = target['labels'].to(self.device)
num_target_boxes = target_boxes.size(0)
bbox_pred_sample = bbox_pred[i, :num_target_boxes]
class_scores_sample = class_scores[i, :num_target_boxes]
bbox_loss, class_loss = self.reast_loss(
bbox_pred_sample,
target_boxes,
class_scores_sample,
target_labels
)
batch_loss += bbox_loss + class_loss
total_loss += batch_loss.item()
if mode == "training":
batch_loss.backward()
self.optimizer.step()
# Compute the average loss for the epoch
avg_loss: float = np.divide(total_loss, len(data_loader))
logging.info(f"{mode.capitalize()} epoch completed with average loss: {avg_loss:.4f}")
return avg_loss
From what I know, I was expecting my bar (I’ve tested my bar, and it works, its my code), I was supposed to get loss, but for some reason it doesn’t know, maybe its because its None, or 0, which my bar can’t divide by zero.
Training: |####################| 010/010 [100.00%] in 0.0s (1620482.0/s, ETA: 0.0s)
Validation: |####################| 005/005 [100.00%] in 0.0s (652656.0/s, ETA: 0.0s)
Epoch 23/30
Training: |####################| 010/010 [100.00%] in 0.0s (793965.0/s, ETA: 0.0s)
Validation: |####################| 005/005 [100.00%] in 0.0s (830151.0/s, ETA: 0.0s)
Epoch 24/30
Training: |####################| 010/010 [100.00%] in 0.0s (1679543.0/s, ETA: 0.0s)
Validation: |####################| 005/005 [100.00%] in 0.0s (883236.0/s, ETA: 0.0s)
Epoch 25/30
Training: |####################| 010/010 [100.00%] in 0.0s (1769598.0/s, ETA: 0.0s)
Validation: |####################| 005/005 [100.00%] in 0.0s (779787.0/s, ETA: 0.0s)
Epoch 26/30
Training: |####################| 010/010 [100.00%] in 0.0s (1629195.0/s, ETA: 0.0s)
Validation: |####################| 005/005 [100.00%] in 0.0s (733352.0/s, ETA: 0.0s)
Epoch 27/30
Training: |####################| 010/010 [100.00%] in 0.0s (1589319.0/s, ETA: 0.0s)
Validation: |####################| 005/005 [100.00%] in 0.0s (845737.0/s, ETA: 0.0s)
Epoch 28/30
Training: |####################| 010/010 [100.00%] in 0.0s (1379310.0/s, ETA: 0.0s)
Validation: |####################| 005/005 [100.00%] in 0.0s (822233.0/s, ETA: 0.0s)
Epoch 29/30
Training: |####################| 010/010 [100.00%] in 0.0s (1709986.0/s, ETA: 0.0s)
Validation: |####################| 005/005 [100.00%] in 0.0s (1006036.0/s, ETA: 0.0s)
Epoch 30/30
Training: |####################| 010/010 [100.00%] in 0.0s (1642845.0/s, ETA: 0.0s)
Validation: |####################| 005/005 [100.00%] in 0.0s (756086.0/s, ETA: 0.0s)
Model saved at: recognition_model.pt