I’m trying to build a framework that uses Stable Diffusion as a backbone. My goal is to start from an initial latent embedding and optimize it based on what Stable Diffusion generates. However, the gradients don’t seem to be applied properly: although the pipeline’s output has gradients and the loss is non-zero, the embedding I’m trying to optimize never changes.
This is the code:
import torch
import torch.nn as nn
from PIL import Image
from torchvision import transforms
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline

device = torch.device("cuda" if torch.cuda.is_available() else "mps")
text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
diffusion_pipeline = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(device)
img2img_pipeline = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(device)
preprocess = transforms.Compose([
    transforms.ToTensor()
])
def optimize_text_embedding(input_image, target_text, num_steps=10):
    # Freeze the diffusion model weights
    for param in diffusion_pipeline.unet.parameters():
        param.requires_grad = False

    # Encode target text
    text_inputs = tokenizer(target_text, return_tensors="pt").to(device)
    target_embedding = text_model(**text_inputs).last_hidden_state.requires_grad_(True)

    # Initialize the optimized embedding as the target embedding
    optimized_embedding = nn.Parameter(target_embedding.clone().detach().to(device))
    optimizer = torch.optim.Adam([optimized_embedding], lr=1e-3)

    # Convert input image to tensor
    input_image_tensor = preprocess(input_image).unsqueeze(0).to(device)

    for step in range(num_steps):
        optimizer.zero_grad()

        # Generate image using the optimized embedding
        generated_images = diffusion_pipeline(prompt_embeds=optimized_embedding, init_image=input_image,
                                              output_type="latent").images

        # Decode the generated latent image to get the final image
        generated_image = diffusion_pipeline.vae.decode(
            generated_images / diffusion_pipeline.vae.config.scaling_factor)[0]
        generated_image_tensor = generated_image.unsqueeze(0).to(device)

        # Compute loss between generated image and input image
        loss = ((generated_image_tensor - input_image_tensor) ** 2).mean()
        print(f"Loss: {loss.item()}")

        # Backpropagate the loss
        loss.backward()

        # Update the embeddings
        optimizer.step()

        # Print gradients for debugging
        print("Optimized embedding gradients: ", optimized_embedding.grad)
        print("Optimized embedding: ", optimized_embedding)

    # Unfreeze the diffusion model weights after optimization
    for param in diffusion_pipeline.unet.parameters():
        param.requires_grad = True

    return optimized_embedding
input_image_path = "dogs/n02085936_233.jpg"
target_text = "A standing dog"
input_image = Image.open(input_image_path).convert("RGB")
input_image = input_image.resize((512, 512), Image.BICUBIC)
input_image.save("input_image.png")
optimized_embedding = optimize_text_embedding(input_image, target_text)
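For reference, this is roughly how I verified that the pipeline output carries gradients (a minimal sketch run inside the loop, using the same variable names as above):

# Sketch of the gradient check inside the loop:
# the latents and the loss should both have a grad_fn
# if gradients actually flow through the UNet.
print("latents require grad:", generated_images.requires_grad)
print("latents grad_fn:", generated_images.grad_fn)
print("loss grad_fn:", loss.grad_fn)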
Any thoughts?
I’ve tried multiple things: setting the gradients manually, not wrapping the embedding in an nn.Parameter, and so on, but nothing worked.
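For example, one variant I tried (a sketch with the same setup as above) keeps the embedding as a plain leaf tensor with requires_grad instead of wrapping it in an nn.Parameter:

# Variant: plain leaf tensor instead of nn.Parameter (didn't help either)
optimized_embedding = target_embedding.clone().detach().to(device).requires_grad_(True)
optimizer = torch.optim.Adam([optimized_embedding], lr=1e-3)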