Generation process:
# Standard (unguided) sampling: start from pure Gaussian noise and
# iteratively denoise with the pretrained UNet under the scheduler's
# reverse-diffusion update rule.
x = torch.randn(4, 3, 256, 256).to(device)  # batch of 4, 3-channel 256x256 samples
for i, t in tqdm(enumerate(scheduler.timesteps)):
    # Some schedulers rescale the model input per timestep.
    model_input = scheduler.scale_model_input(x, t)
    with torch.no_grad():  # plain sampling needs no gradients
        noise_pred = image_pipe.unet(model_input, t)["sample"]
    # One reverse step: move to the previous (less noisy) sample.
    x = scheduler.step(noise_pred, t, sample=x).prev_sample
Guidance
1x = torch.randn(4, 3, 256, 256).to(device)2for i, t in tqdm(enumerate(scheduler.timesteps)):3 x = x.detach().requires_grad_()4 model_input = scheduler.scale_model_input(x, t)5 noise_pred = image_pipe.unet(model_input, t)["sample"]6
7 x0 = scheduler.step(noise_pred, t, x).pred_original_sample8 loss = <custom_loss>(x0) * <guidance_loss_scale>9 cond_grad = -torch.autograd.grad(loss, x)[0]10 x = x.detach() + cond_grad11
12 x = scheduler.step(noise_pred, t, x).prev_sample
CLIP Guidance
1with torch.no_grad():2 text_features = clip_model.encode_text(text)3
4for i, t in tqdm(enumerate(scheduler.timesteps)):5 # print(i, t) # (1, tensor(1000)), (2, tensor(980))...6 model_input = scheduler.scale_model_input(x, t) # DDIM loaded7 with torch.no_grad():8 # image_pipe is loaded by the same name9 noise_pred = image_pipe.unet(model_input, t)["sample"]10 cond_grad = 011 for cut in range(n_cuts):12 x = x.detach().requires_grad_()13 x0 = scheduler.step(noise_pred,t, sample=x).pred_original_sample14 loss = <clip_loss>(x0, text_features) * guidance_scale15 cond_grad -= torch.autograd.grad(loss, x)[0] / n_cuts9 collapsed lines
16
17 if i % 25 == 0:18 print(f"Steps {i} loss: {loss.item()}")19
20 alpha_bar = scheduler.alphas_cumprod[i]21 # `alpha_bar` here is decreasing and works for textures.22 # Can be changed to some increasing coefficients!23 x = x.detach() + cond_grad * alpha_bar.sqrt()24 x = scheduler.step(noise_pred, t, x).prev_sample