I have a module defined as follows:
class MoR(nn.Module):
    def __init__(self, work=False):
        super().__init__()
        self.sparsity = 0
        self.scores = nn.Parameter(torch.tensor(-4.0))
        self.f = torch.sigmoid
        self.threshold = self.f(self.scores)
        self.min_loss = float('inf')
        self.sparsity_best = 0  # used for training
        self.sparsity_loss = 0
        self.recon_loss = 0
        self.work = work
        self.num = 0
        self.sparsity_avg = 0

    def forward(self, inputs: torch.Tensor):
        if self.work:
            # get mask
            mask = self.generate_mask(inputs)
            if torch.sum(torch.isnan(inputs)).bool():
                import pdb
                pdb.set_trace()
            self.get_sparsity(inputs)
            self.update_sparsity_avg()
            print("mask:", mask.requires_grad)
            print("scores:", self.scores.requires_grad)
            print("sparsity:", self.sparsity.requires_grad)
            print("threshold:", self.threshold.requires_grad)
            return inputs * mask
        else:
            return inputs
In this module, scores has requires_grad=True, but every tensor computed from it afterwards has requires_grad=False. Even threshold, the simplest case, ends up with requires_grad=False, although it is just sigmoid(scores), which should certainly be differentiable.
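To illustrate what I expect: taken in isolation, a minimal standalone check of just that step prints True for both tensors whenever gradient tracking is enabled:

import torch
import torch.nn as nn

# standalone reproduction of just the scores -> threshold step
scores = nn.Parameter(torch.tensor(-4.0))
threshold = torch.sigmoid(scores)

print(scores.requires_grad)     # True: scores is a leaf Parameter
print(threshold.requires_grad)  # True, as long as grad mode is enabled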
This only happens when I insert the module into an LLM (Large Language Model). When I apply the module to a small model, training works normally.
Here is the small model:
class LinearModel(nn.Module):
    def __init__(self):
        super(LinearModel, self).__init__()
        self.mor1 = MoR(work=True)
        self.relu1 = nn.ReLU()          # ReLU activation layer
        self.linear1 = nn.Linear(1, 1)  # a simple linear layer
        self.mor2 = MoR(work=True)
        self.relu2 = nn.ReLU()          # ReLU activation layer
        self.linear2 = nn.Linear(1, 1)

    def forward(self, x):
        x = self.mor1(x)
        x = self.linear1(x)
        x = self.relu1(x)   # apply ReLU activation
        x = self.mor2(x)
        x = self.linear2(x)
        x = self.relu2(x)   # apply ReLU activation
        return x
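For reference, this is the kind of minimal run I mean by "training is normal" (the input shapes and loss are illustrative, and the MoR helper methods omitted above, generate_mask / get_sparsity / update_sparsity_avg, are assumed to be defined as in my full code):

import torch
import torch.nn as nn

# illustrative forward/backward pass through the small model
model = LinearModel()
x = torch.randn(8, 1)
target = torch.randn(8, 1)

out = model(x)                 # the prints inside MoR.forward report True here
loss = nn.functional.mse_loss(out, target)
loss.backward()

print(model.mor1.scores.grad)  # non-None if generate_mask uses scores differentiably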
It's very strange; I don't know whether there is some underlying mechanism that automatically sets requires_grad to False on the module's outputs.
Note that I haven't found an answer from GPT or online forums. The usual explanations, a no_grad() context, a break in the computation graph, device transfers, or going through .data, do not apply here; as the code shows, it is just a sigmoid operation.
print("mask:", mask.requires_grad)
print("scores:", self.scores.requires_grad)
print("sparsity:", self.sparsity.requires_grad)
print("threshold:", self.threshold.requires_grad)
the parameter’s requires_grad is True
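For completeness, these are the kinds of probes that could sit next to those prints in MoR.forward to confirm the autograd state at that exact point (illustrative; torch.is_inference_mode_enabled needs a reasonably recent PyTorch):

# illustrative probes inside MoR.forward, next to the existing prints
print("grad enabled:  ", torch.is_grad_enabled())            # False inside torch.no_grad()
print("inference mode:", torch.is_inference_mode_enabled())  # True inside torch.inference_mode()
print("scores is leaf:", self.scores.is_leaf, "requires_grad:", self.scores.requires_grad)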