This snippet tries to take the gradient with respect to the input of a trained model:
import torch
import torch.nn as nn
# Reproduction script: fit a small tanh MLP on random data, then compute the
# gradient of its scalar output with respect to a separate input tensor.

# --- Experiment knobs --------------------------------------------------------
DATASET_SIZE = 10       # number of (x, y) training pairs
TUNING_EPOCH = 10       # passes over the dataset; 0 skips training entirely
RANDOMIZATION = "rand"  # name of the torch factory used for inputs ("rand", "randn")
shape = (10000,)        # per-sample input shape
bsize = 1               # batch size

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Resolve the tensor factories by attribute lookup rather than eval(): the
# result is the same callable, but no arbitrary expression is ever executed.
_make = getattr(torch, RANDOMIZATION)
_make_like = getattr(torch, f"{RANDOMIZATION}_like")

# The tensor whose gradient is inspected at the end. It is created before the
# model so the sequence of random draws matches the original script.
# NOTE: the name shadows the builtin `input`; it is kept because the rest of
# the script (and the final print) refers to it.
input = _make(bsize, *shape).to(device).requires_grad_()
numel = input.numel()

# Flatten -> Linear -> Tanh -> Linear, producing one scalar per sample.
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(numel, numel // 2),
    nn.Tanh(),
    nn.Linear(numel // 2, 1),
).to(device)

model_optimizer = torch.optim.Adam(model.parameters())
# This optimizer is only used to clear `input.grad`; it never takes a step.
optimizer = torch.optim.Adam([input])

# Random inputs drawn like `input`, paired with targets in [8, 9).
dataset = [
    (_make_like(input), (torch.rand(bsize, 1) + 8).to(device))
    for _ in range(DATASET_SIZE)
]

# --- Train the model ---------------------------------------------------------
loss_fn = nn.MSELoss()  # stateless, so one instance serves every iteration
model.train()
for e in range(TUNING_EPOCH):
    for x, y in dataset:
        model_optimizer.zero_grad()
        output = model(x)
        loss = loss_fn(output, y)
        # Restrict backward to the model parameters so nothing else
        # accumulates a gradient during training.
        loss.backward(inputs=list(model.parameters()))
        model_optimizer.step()

# --- Gradient of the output with respect to `input` --------------------------
model.eval()
optimizer.zero_grad()
output = model(input)
# Only `input.grad` is populated here; parameter gradients are left untouched.
output.backward(inputs=input)
# NOTE(review): an all-zero result is consistent with the Tanh layer being
# saturated after training (all-positive inputs push every hidden
# pre-activation in one direction) -- confirm by inspecting model[1](input).
print("ninput.grad", input.grad)
The gradient `input.grad`, however, is all zeros. There are three individual fixes, each of which resolves the problem on its own:
- set `TUNING_EPOCH = 0`, i.e. do not train the model
- set `RANDOMIZATION = "randn"`, i.e. use a different input distribution
- set `shape = (1000,)`, i.e. make the input smaller
Why would these three changes individually fix the zero gradient issue?