I am currently training a Faster R-CNN model with MobileNetV2 as the backbone, on my own dataset of approximately 9k images (training and validation sets combined).
Initially the training runs smoothly, but one epoch takes about 30 minutes, which is too slow given that I need to train for 200 epochs. After a few epochs I also noticed that memory usage keeps increasing every epoch until PyCharm eventually crashes (I monitored the memory usage in Task Manager). I suspect there may be a problem in the code, for example in the dataloader.
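To quantify the growth instead of only watching Task Manager, I am planning to log memory once per epoch with a small helper like the one below. This is only a rough sketch: psutil is an extra dependency I would install, and log_memory is just a name I made up.

import os
import psutil  # assumption: installed separately (pip install psutil)
import torch

def log_memory(tag):
    # Resident set size of the training process, in MB
    rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
    # Peak GPU memory allocated by PyTorch so far, in MB
    gpu_mb = torch.cuda.max_memory_allocated() / 1024 ** 2 if torch.cuda.is_available() else 0.0
    print("[{}] host RSS: {:.0f} MB, peak GPU: {:.0f} MB".format(tag, rss_mb, gpu_mb))

# e.g. call log_memory("epoch {}".format(epoch)) at the end of each epoch in main()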
Below are my device specifications and the parameters used for training:
GPU: RTX 3050 Ti
RAM: 32GB
Windows: 10
Epochs: 200
Num_workers: 8
Batch: 16
Step_size: 50
Optimizer: SGD
And these are the Python library versions I installed:
Python: 3.7
torch: 1.7.1+cu110
torchvision: 0.8.2+cu110
torchaudio: 0.7.2
Below is the train.py used for the Faster R-CNN:
import os
import datetime
import torch
import torchvision
import transforms
# from memory_profiler import profile, memory_usage
from network_files import FasterRCNN, AnchorsGenerator
from backbone import MobileNetV2, vgg
from my_dataset import VOCDataSet
from train_utils import GroupedBatchSampler, create_aspect_ratio_groups
from train_utils import train_eval_utils as utils

def create_model(num_classes):
    # Uncomment to use VGG16 as the backbone
    # vgg_feature = vgg(model_name="vgg16", weights_path="./backbone/vgg16.pth").features
    # backbone = torch.nn.Sequential(*list(vgg_feature._modules.values())[:-1])  # Remove the last MaxPool layer
    # backbone.out_channels = 512

    # Use MobileNetV2 as the backbone
    backbone = MobileNetV2(weights_path="./backbone/mobilenet_v2.pth").features
    backbone.out_channels = 1280  # Output channels of the backbone feature map

    anchor_generator = AnchorsGenerator(sizes=((32, 64, 128, 256, 512),),
                                        aspect_ratios=((0.5, 1.0, 2.0),))

    roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],  # Feature maps used for RoI pooling
                                                    output_size=[7, 7],   # Output size of RoI pooling
                                                    sampling_ratio=2)     # Sampling ratio

    model = FasterRCNN(backbone=backbone,
                       num_classes=num_classes,
                       rpn_anchor_generator=anchor_generator,
                       box_roi_pool=roi_pooler)
    return model

def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # File to save COCO evaluation results
    results_file = "results{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # Create the save_weights directory if it does not exist
    if not os.path.exists("save_weights"):
        os.makedirs("save_weights")

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor()]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = "./"  # Root directory of the VOC-style dataset
    aspect_ratio_group_factor = 3
    batch_size = 16
    amp = False  # Use automatic mixed precision training

    # Check that the VOC root directory exists
    if not os.path.exists(os.path.join(VOC_root, "VOCdevkit")):
        raise FileNotFoundError("VOCdevkit does not exist in path: '{}'.".format(VOC_root))

    # Load the training dataset
    train_dataset = VOCDataSet(VOC_root, "2012", data_transform["train"], "train.txt")
    train_sampler = None
    # Group images by aspect ratio to reduce padding and memory usage
    if aspect_ratio_group_factor >= 0:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        group_ids = create_aspect_ratio_groups(train_dataset, k=aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, batch_size)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)

    num_workers = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # Number of dataloader workers
    print('Using %g dataloader workers' % num_workers)

    # The batch sampler only exists when aspect ratio grouping is enabled,
    # so branch on the group factor rather than on train_sampler
    if aspect_ratio_group_factor >= 0:
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_sampler=train_batch_sampler,
                                                        pin_memory=True,
                                                        num_workers=num_workers,
                                                        collate_fn=train_dataset.collate_fn)
    else:
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_size=batch_size,
                                                        sampler=train_sampler,
                                                        pin_memory=True,
                                                        num_workers=num_workers,
                                                        collate_fn=train_dataset.collate_fn)

    # Load the validation dataset
    val_dataset = VOCDataSet(VOC_root, "2012", data_transform["val"], "val.txt")
    val_data_loader = torch.utils.data.DataLoader(val_dataset,
                                                  batch_size=16,
                                                  shuffle=False,
                                                  pin_memory=True,
                                                  num_workers=num_workers,
                                                  collate_fn=val_dataset.collate_fn)
    # Create the model
    model = create_model(num_classes=5)
    model.to(device)

    scaler = torch.cuda.amp.GradScaler() if amp else None

    train_loss = []
    learning_rate = []
    val_map = []

    # Stage 1: freeze the backbone and train only the RPN and prediction heads
    for param in model.backbone.parameters():
        param.requires_grad = False

    optimizer = torch.optim.SGD([p for p in model.parameters() if p.requires_grad],
                                lr=0.005, momentum=0.9, weight_decay=0.0005)

    init_epochs = 5
    for epoch in range(init_epochs):
        mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
                                              device, epoch, print_freq=50,
                                              warmup=True, scaler=scaler)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        coco_info = utils.evaluate(model, val_data_loader, device=device)

        # Append COCO metrics, mean loss and learning rate to the results file
        with open(results_file, "a") as f:
            result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])

    torch.save(model.state_dict(), "./save_weights/pretrain.pth")
    # Stage 2: unfreeze most of the backbone and train the entire network
    # (the first few backbone layers stay frozen)
    for name, parameter in model.backbone.named_parameters():
        split_name = name.split(".")[0]
        if split_name in ["0", "1", "2", "3"]:
            parameter.requires_grad = False
        else:
            parameter.requires_grad = True

    optimizer = torch.optim.SGD([p for p in model.parameters() if p.requires_grad],
                                lr=0.005, momentum=0.9, weight_decay=0.0005)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.33)

    num_epochs = 200
    for epoch in range(init_epochs, init_epochs + num_epochs, 1):
        mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader,
                                              device, epoch, print_freq=100,
                                              warmup=True, scaler=scaler)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        lr_scheduler.step()

        coco_info = utils.evaluate(model, val_data_loader, device=device)

        with open(results_file, "a") as f:
            result_info = [f"{i:.4f}" for i in coco_info + [mean_loss.item()]] + [f"{lr:.6f}"]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])

        # Save checkpoints only for the last 5 epochs
        if epoch >= (init_epochs + num_epochs - 5):
            save_files = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch}
            torch.save(save_files, "./save_weights/mobile-model-{}.pth".format(epoch))
    # Plot loss and learning rate curves
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # Plot mAP curve
    if len(val_map) != 0:
        from plot_curve import plot_map
        plot_map(val_map)


if __name__ == "__main__":
    main()
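One test I am considering to narrow the problem down is iterating the training dataloader on its own, with no model involved, and checking whether host memory still grows. This is only a rough sketch that reuses roughly the same VOCDataSet and DataLoader settings as in train.py (but without the aspect-ratio batch sampler); the psutil logging is my own addition:

import os
import psutil  # assumption: installed separately (pip install psutil)
import torch
import transforms
from my_dataset import VOCDataSet


def check_dataloader_memory():
    dataset = VOCDataSet("./", "2012", transforms.Compose([transforms.ToTensor()]), "train.txt")
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=16,
                                         shuffle=True,
                                         num_workers=8,
                                         pin_memory=True,
                                         collate_fn=dataset.collate_fn)
    proc = psutil.Process(os.getpid())
    for epoch in range(5):
        for images, targets in loader:
            pass  # just exhaust the loader, no model involved
        print("epoch {}: host RSS {:.0f} MB".format(epoch, proc.memory_info().rss / 1024 ** 2))


if __name__ == "__main__":  # guard is required on Windows because of the worker processes
    check_dataloader_memory()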
Finally, this is the log of the first epoch: it takes 30 minutes to finish even though CUDA is used for training.
Using cuda device training.
Using [0, 0.5, 0.6299605249474366, 0.7937005259840997, 1.0, 1.2599210498948732, 1.5874010519681994, 2.0, inf] as bins for aspect ratio quantization
Count of instances per bin: [7888]
Using 8 dataloader workers
Epoch: [0] [ 0/493] eta: 3:10:50.930041 lr: 0.000015 loss: 2.3496 (2.3496) loss_classifier: 1.6119 (1.6119) loss_box_reg: 0.0329 (0.0329) loss_objectness: 0.6825 (0.6825) loss_rpn_box_reg: 0.0223 (0.0223) time: 23.2270 data: 9.6419 max mem: 2793
Epoch: [0] [ 50/493] eta: 0:29:50.711795 lr: 0.000523 loss: 0.4996 (1.1243) loss_classifier: 0.1341 (0.5352) loss_box_reg: 0.0311 (0.0369) loss_objectness: 0.2995 (0.5273) loss_rpn_box_reg: 0.0349 (0.0248) time: 3.6610 data: 0.0000 max mem: 3401
Epoch: [0] [100/493] eta: 0:25:17.636046 lr: 0.001030 loss: 0.2707 (0.7559) loss_classifier: 0.0866 (0.3344) loss_box_reg: 0.0200 (0.0356) loss_objectness: 0.1424 (0.3629) loss_rpn_box_reg: 0.0216 (0.0231) time: 3.6545 data: 0.0000 max mem: 3401
Epoch: [0] [150/493] eta: 0:21:41.925717 lr: 0.001538 loss: 0.4117 (0.6141) loss_classifier: 0.1599 (0.2656) loss_box_reg: 0.0621 (0.0357) loss_objectness: 0.1666 (0.2903) loss_rpn_box_reg: 0.0232 (0.0225) time: 3.6502 data: 0.0001 max mem: 3401
Epoch: [0] [200/493] eta: 0:18:23.093898 lr: 0.002046 loss: 0.2504 (0.5403) loss_classifier: 0.0935 (0.2300) loss_box_reg: 0.0295 (0.0369) loss_objectness: 0.1117 (0.2515) loss_rpn_box_reg: 0.0158 (0.0219) time: 3.6510 data: 0.0002 max mem: 3401
Epoch: [0] [250/493] eta: 0:15:09.790380 lr: 0.002553 loss: 0.4560 (0.4929) loss_classifier: 0.1365 (0.2058) loss_box_reg: 0.0711 (0.0380) loss_objectness: 0.2080 (0.2275) loss_rpn_box_reg: 0.0403 (0.0216) time: 3.6640 data: 0.0000 max mem: 3401
Epoch: [0] [300/493] eta: 0:12:01.483949 lr: 0.003061 loss: 0.3157 (0.4603) loss_classifier: 0.0782 (0.1881) loss_box_reg: 0.0362 (0.0400) loss_objectness: 0.1764 (0.2110) loss_rpn_box_reg: 0.0249 (0.0212) time: 3.7164 data: 0.0001 max mem: 3401
Epoch: [0] [350/493] eta: 0:08:55.447698 lr: 0.003569 loss: 0.2501 (0.4348) loss_classifier: 0.0756 (0.1733) loss_box_reg: 0.0413 (0.0411) loss_objectness: 0.1147 (0.1992) loss_rpn_box_reg: 0.0185 (0.0212) time: 3.7268 data: 0.0001 max mem: 3401
Epoch: [0] [400/493] eta: 0:05:48.394548 lr: 0.004076 loss: 0.2758 (0.4132) loss_classifier: 0.0661 (0.1615) loss_box_reg: 0.0446 (0.0420) loss_objectness: 0.1335 (0.1889) loss_rpn_box_reg: 0.0315 (0.0208) time: 3.7426 data: 0.0001 max mem: 3401
Epoch: [0] [450/493] eta: 0:02:40.654559 lr: 0.004584 loss: 0.2503 (0.3963) loss_classifier: 0.0699 (0.1520) loss_box_reg: 0.0491 (0.0429) loss_objectness: 0.1125 (0.1808) loss_rpn_box_reg: 0.0188 (0.0206) time: 3.6161 data: 0.0001 max mem: 3401
Epoch: [0] [492/493] eta: 0:00:03.726008 lr: 0.005000 loss: 0.2180 (0.3838) loss_classifier: 0.0727 (0.1454) loss_box_reg: 0.0495 (0.0436) loss_objectness: 0.0839 (0.1744) loss_rpn_box_reg: 0.0118 (0.0203) time: 3.6157 data: 0.0000 max mem: 3401
Epoch: [0] Total time: 0:30:37 (3.7265 s / it)
creating index...
index created!
Test: [ 0/124] eta: 0:21:36.053664 model_time: 3.1140 (3.1140) evaluator_time: 0.0030 (0.0030) time: 10.4520 data: 7.2820 max mem: 3401
Test: [100/124] eta: 0:00:52.285159 model_time: 0.7686 (2.0934) evaluator_time: 0.0070 (0.0057) time: 1.8716 data: 0.0001 max mem: 3401
Test: [123/124] eta: 0:00:02.106653 model_time: 0.2362 (2.0335) evaluator_time: 0.0029 (0.0076) time: 1.8302 data: 0.0000 max mem: 3401
Test: Total time: 0:04:21 (2.1087 s / it)
Averaged stats: model_time: 0.2362 (2.0335) evaluator_time: 0.0029 (0.0076)
Accumulating evaluation results...
DONE (t=0.48s).
IoU metric: bbox
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.012
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.046
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.002
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.014
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.033
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.109
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.119
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.003
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.131
Epoch: [1] [ 0/493] eta: 1:42:49.210967 lr: 0.005000 loss: 0.2061 (0.2061) loss_classifier: 0.0567 (0.0567) loss_box_reg: 0.0293 (0.0293) loss_objectness: 0.1100 (0.1100) loss_rpn_box_reg: 0.0100 (0.0100) time: 12.5136 data: 8.2839 max mem: 3401
I checked train.py and it appears to train on my dataset normally.
I also tried invalidating caches and restarting PyCharm several times before rerunning the code, but both problems persist: training is slow and memory usage gradually increases every epoch. I would like to know whether the problem is in the code.
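There is also a commented-out memory_profiler import at the top of train.py; if I enable it, one option is to wrap main() with memory_usage to get a coarse host-memory trace over the whole run. A rough sketch of what I mean (the sampling interval is arbitrary):

from memory_profiler import memory_usage

# Replace the __main__ block of train.py with this to sample host memory
# every 30 seconds while main() runs, then print the trace afterwards.
if __name__ == "__main__":
    mem_trace = memory_usage((main, (), {}), interval=30.0)
    print("host memory samples (MiB):", mem_trace)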