I am building a custom model for an instance segmentation task. It takes 2D images and, during training, a target dict. I think there is an issue with the model architecture, but I'm not sure where. I have tried many things and used different ways to extract features, but I keep getting stuck on similar errors to the one above.
I'm new to deep learning and not sure what is going on here, but the output shapes look suspicious to me, and I suspect they are the source of the error. Any help is appreciated. Thanks in advance!
Module
```python
class CustomModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = backbone_utils.resnet_fpn_backbone(
            backbone_name='resnet50',
            weights=models.ResNet50_Weights.DEFAULT
        )
        self.rpn_layer = rpn.RegionProposalNetwork(
            anchor_generator=anchor_utils.AnchorGenerator(
                sizes=((6, 9, 16), (6, 9, 16), (6, 9, 16), (6, 9, 16), (6, 9, 16)),
                aspect_ratios=(
                    (1.0, 1.25, 1.5),
                    (1.0, 1.25, 1.5),
                    (1.0, 1.25, 1.5),
                    (1.0, 1.25, 1.5),
                    (1.0, 1.25, 1.5)
                )
            ),
            batch_size_per_image=4,
            bg_iou_thresh=0.3,
            fg_iou_thresh=0.7,
            head=rpn.RPNHead(in_channels=256, num_anchors=9),
            nms_thresh=0.7,
            positive_fraction=0.5,
            post_nms_top_n={'training': 2000, 'testing': 300},
            pre_nms_top_n={'training': 12000, 'testing': 6000},
            score_thresh=0.0
        )
        self.roi_align = ops.MultiScaleRoIAlign(
            output_size=7,
            sampling_ratio=2,
            featmap_names=['0', '1', '2', '3'],
            canonical_scale=224,
            canonical_level=4
        )
        self.mask_head = mask_rcnn.MaskRCNNPredictor(
            in_channels=256,
            dim_reduced=10,
            num_classes=10
        )
        self.bbox3d = Bbox3DPredictor(
            features=256,
            bins=2,
            w=0.4
        )

    def forward(self, x, targets):
        backbone_features = self.backbone(x)
        print(backbone_features)
        for i, feature in backbone_features.items():
            print(f"Backbone features shapes", feature.shape)
        img_list = image_list.ImageList(x, [img.shape[-2:] for img in x])
        proposals, proposals_losses = self.rpn_layer(
            img_list,
            backbone_features,
            targets
        )
        for proposal in proposals:
            print(f"Proposals shapes", proposal.shape)
        image_shapes = img_list.image_sizes
        roi_features = self.roi_align(
            backbone_features,
            proposals,
            image_shapes
        )
        print(f"ROI Align shape: {roi_features.shape}")
        if self.training:
            losses = {}
            for target in targets:
                masks = target["mask"]
                bbox3ds = target["boxes"]
                pcs = target["pc"]
                mask_logits = self.mask_head(roi_features)
                mask_loss = fnc.binary_cross_entropy_with_logits(mask_logits, masks)
                losses['mask_loss'] = mask_loss
                print(f"Mask logits: {mask_logits}")
```
Output
```
Backbone features shapes torch.Size([4, 256, 64, 64])
Backbone features shapes torch.Size([4, 256, 32, 32])
Backbone features shapes torch.Size([4, 256, 16, 16])
Backbone features shapes torch.Size([4, 256, 8, 8])
Backbone features shapes torch.Size([4, 256, 4, 4])
Proposals shapes torch.Size([2000, 4])
Proposals shapes torch.Size([2000, 4])
Proposals shapes torch.Size([2000, 4])
Proposals shapes torch.Size([2000, 4])
ROI Align shape: torch.Size([8000, 256, 7, 7])
```
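The suspicious numbers are the key here. `MultiScaleRoIAlign` stacks the proposals from the whole batch, so the mask head sees 8000 = 4 images × 2000 proposals, and `MaskRCNNPredictor`'s stride-2 deconv turns each 7×7 ROI into a 14×14 logit map. A standalone shape check (values taken from the printout above) shows what the mask head emits; the logits are per proposal, while `target["mask"]` is per image, so the binary cross-entropy shapes cannot match:

```python
import torch
from torchvision.models.detection import mask_rcnn

# MaskRCNNPredictor = ConvTranspose2d(in_channels, dim_reduced, 2, stride=2)
#                     -> ReLU -> Conv2d(dim_reduced, num_classes, 1),
# so a 7x7 ROI becomes a 14x14 logit map.
head = mask_rcnn.MaskRCNNPredictor(in_channels=256, dim_reduced=10, num_classes=10)
roi_features = torch.rand(8000, 256, 7, 7)  # 4 images x 2000 proposals, as printed above
print(head(roi_features).shape)             # torch.Size([8000, 10, 14, 14])
```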
Two changes fix the error:

- Corrected `dim_reduced` in `MaskRCNNPredictor`: `dim_reduced` is the channel width of the predictor's intermediate deconv layer; 256 is the Mask R-CNN default and matches the 256 feature channels coming out of the ROI Align layer.
- Proper mask loss calculation: `mask_logits` and `masks` must have the same shape during loss computation. The logits are per proposal, stacked across the whole batch, so the mask targets must be per proposal as well, not one mask tensor per image; see the sketch below.
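For the second point, per-proposal mask targets are built by cropping each matched ground-truth mask to its proposal box and resizing it to the logit resolution. torchvision's `RoIHeads` does this internally with `roi_align` (see `project_masks_on_boxes` in `torchvision.models.detection.roi_heads`). A minimal sketch, assuming proposals have already been matched one-to-one with ground-truth masks (the helper name is mine):

```python
import torch
from torchvision.ops import roi_align

def project_masks_on_proposals(gt_masks, proposals, out_size=14):
    # gt_masks:  [N, H, W] one matched ground-truth mask per proposal
    # proposals: [N, 4] float boxes in (x1, y1, x2, y2) image coordinates
    masks = gt_masks[:, None].to(torch.float32)              # [N, 1, H, W]
    idx = torch.arange(len(proposals), dtype=torch.float32)  # each mask is its own "image"
    rois = torch.cat([idx[:, None], proposals], dim=1)       # [N, 5], batch index first
    return roi_align(masks, rois, (out_size, out_size), spatial_scale=1.0)[:, 0]
```

The cropped targets can then be fed to `binary_cross_entropy_with_logits` alongside the matching rows of `mask_logits`.

Full corrected code: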
```python
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.ops as ops
import torchvision.models.detection.backbone_utils as backbone_utils
import torchvision.models.detection.rpn as rpn
import torchvision.models.detection.anchor_utils as anchor_utils
import torchvision.models.detection.image_list as image_list
import torchvision.models.detection.mask_rcnn as mask_rcnn
import torch.nn.functional as F

class Bbox3DPredictor(nn.Module):
    def __init__(self, features, bins, w):
        super().__init__()
        # Custom 3D bounding box predictor implementation

    def forward(self, x):
        # Forward pass implementation
        pass

class CustomModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = backbone_utils.resnet_fpn_backbone(
            backbone_name='resnet50',
            weights=models.ResNet50_Weights.DEFAULT
        )
        self.rpn_layer = rpn.RegionProposalNetwork(
            anchor_generator=anchor_utils.AnchorGenerator(
                sizes=((6, 9, 16),) * 5,
                aspect_ratios=((1.0, 1.25, 1.5),) * 5
            ),
            batch_size_per_image=4,
            bg_iou_thresh=0.3,
            fg_iou_thresh=0.7,
            head=rpn.RPNHead(in_channels=256, num_anchors=9),
            nms_thresh=0.7,
            positive_fraction=0.5,
            post_nms_top_n={'training': 2000, 'testing': 300},
            pre_nms_top_n={'training': 12000, 'testing': 6000},
            score_thresh=0.0
        )
        self.roi_align = ops.MultiScaleRoIAlign(
            output_size=7,
            sampling_ratio=2,
            featmap_names=['0', '1', '2', '3'],
            canonical_scale=224,
            canonical_level=4
        )
        self.mask_head = mask_rcnn.MaskRCNNPredictor(
            in_channels=256,
            dim_reduced=256,  # corrected: Mask R-CNN's default deconv width
            num_classes=10    # ensure this matches your dataset
        )
        self.bbox3d = Bbox3DPredictor(
            features=256,
            bins=2,
            w=0.4
        )
    def forward(self, x, targets=None):
        backbone_features = self.backbone(x)
        img_list = image_list.ImageList(x, [img.shape[-2:] for img in x])
        proposals, proposal_losses = self.rpn_layer(img_list, backbone_features, targets)
        roi_features = self.roi_align(backbone_features, proposals, img_list.image_sizes)
        mask_logits = self.mask_head(roi_features)  # [total_proposals, num_classes, 14, 14]
        if self.training:
            losses = {}
            losses.update(proposal_losses)  # keep the RPN objectness/box losses too
            # Proposals from all images are stacked along dim 0, so the mask
            # targets concatenated over the batch must match mask_logits' shape.
            masks = torch.cat([t["masks"] for t in targets])
            losses['mask_loss'] = F.binary_cross_entropy_with_logits(mask_logits, masks)
            return losses
        return proposals, mask_logits

# Usage example
x = torch.rand(4, 3, 224, 224)  # batch of 4 images
targets = [{
    "boxes": torch.tensor([[20.0, 20.0, 180.0, 180.0]]),  # RPN training needs GT boxes
    "masks": torch.rand(2000, 10, 14, 14),  # dummy per-proposal targets matching mask_logits
} for _ in range(4)]
model = CustomModel()
model.train()
output = model(x, targets)
```
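For inference, `targets` can be omitted; the model then returns the proposals and the raw mask logits. A quick sketch (the weights are untrained here, so the outputs are meaningless, but it exercises the eval path):

```python
model.eval()
with torch.no_grad():
    proposals, mask_logits = model(torch.rand(1, 3, 224, 224))
print(len(proposals), proposals[0].shape, mask_logits.shape)
# typically: 1 torch.Size([300, 4]) torch.Size([300, 10, 14, 14])
```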