I added two new modules to a large multimodal model. Now I want to train module A with an MSE loss and module B with a CrossEntropy loss. I set `automatic_optimization` to `False` and run the backward pass manually, but the gradients are always `None` (I checked by printing `param.grad`). Besides, when I train with `ddp`, I get:
```
RuntimeError: It looks like your LightningModule has parameters that were not used in producing the loss returned by training_step. If this is intentional, you must enable the detection of unused parameters in DDP, either by setting the string value `strategy='ddp_find_unused_parameters_true'` or by setting the flag in the strategy with `strategy=DDPStrategy(find_unused_parameters=True)`.
```
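If I understand the message correctly, it is asking me to enable unused-parameter detection in the strategy, roughly like this (Lightning 2.x imports; the rest of my Trainer arguments are omitted):

```python
from lightning.pytorch import Trainer
from lightning.pytorch.strategies import DDPStrategy

# Equivalent to strategy="ddp_find_unused_parameters_true";
# all other Trainer arguments are left out here.
trainer = Trainer(strategy=DDPStrategy(find_unused_parameters=True))
```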
When I train with `deepspeed_stage_2_offload`, I get:

```
UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
```

This might indicate that `optimizer.step()` is being skipped because the gradients are empty.
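For completeness, this is roughly how I inspect the gradients after calling `manual_backward` (simplified; `loss` stands in for the summed loss from my training step, and `shift_encoder` is the module that holds the new parameters):

```python
# Inside training_step, right after the backward call:
self.manual_backward(loss)
for name, param in self.shift_encoder.named_parameters():
    if param.requires_grad:
        grad = param.grad
        # This always prints None for the newly added parameters.
        print(name, None if grad is None else grad.norm().item())
```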
I configure the optimizers with the following code:
```python
def configure_optimizers(self):
    def setup_optimizer(param_dict):
        alpha_params = [p for n, p in param_dict.items() if "alpha" in n]
        non_alpha_params = [p for n, p in param_dict.items() if "alpha" not in n]
        optim_groups = [
            {"params": non_alpha_params, "lr": setting.icv_lr},
            {"params": alpha_params, "lr": setting.alpha_lr},
        ]
        if "deepspeed" in setting.strategy:
            optimizer = DeepSpeedCPUAdam(
                optim_groups,
                weight_decay=setting.weight_decay,
            )
        else:
            optimizer = optim.Adam(
                optim_groups,
                weight_decay=setting.weight_decay,
            )
        step_batches = self.trainer.estimated_stepping_batches
        warmup_steps = setting.warmup_step
        if isinstance(warmup_steps, float):
            warm_steps = warmup_steps * step_batches
        elif isinstance(warmup_steps, int):
            warm_steps = warmup_steps
        else:
            raise ValueError(
                f"the warm_steps should be int or float, but got {type(warmup_steps)}"
            )
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=warm_steps, num_training_steps=step_batches
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
        }

    attn_shift_opt = setup_optimizer(self.shift_encoder.attn_shift_params())
    if Stratety.ALTERNATE_TRAINING in self.strategy:
        ffn_shift_opt = setup_optimizer(self.shift_encoder.ffn_shift_params())
        return attn_shift_opt, ffn_shift_opt
    return attn_shift_opt
```
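For context, `attn_shift_params()` and `ffn_shift_params()` each return a dict mapping parameter names to the newly added trainable parameters, conceptually something like this (an illustrative sketch, not my exact implementation; the `"attn_shift"` name filter is made up):

```python
def attn_shift_params(self):
    # Illustrative only: collect the attention-shift parameters by name.
    return {
        name: param
        for name, param in self.named_parameters()
        if "attn_shift" in name and param.requires_grad
    }
```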
In my training step:
```python
def optimize(opt, sch, loss_dict):
    loss_dict = {
        k: v / setting.accumulate_grad_batches for k, v in loss_dict.items()
    }
    self.manual_backward(sum(loss_dict.values()))
    if (batch_idx + 1) % setting.accumulate_grad_batches == 0:
        self.clip_gradients(
            opt,
            gradient_clip_val=setting.grad_clip_val,
            gradient_clip_algorithm="norm",
        )
        opt.step()
        sch.step()
        opt.zero_grad()
    return loss_dict

attn_opt, shift_opt = self.optimizers()
attn_sch, shift_sch = self.lr_schedulers()
loss_dict = {
    **optimize(
        attn_opt,
        attn_sch,
        self.layer_wise_forward(query_inputs, ice_hidden_states),
    ),
    **optimize(
        shift_opt,
        shift_sch,
        self.task_specific_forward(
            query_inputs, ice_logits, ice_label_mask
        ),
    ),
}
```
Please help me!!