unit2-01_finetuning_and_guidance

Generation process:

import torch
from tqdm.auto import tqdm

# Start from random noise and iteratively denoise with the scheduler
x = torch.randn(4, 3, 256, 256).to(device)
for i, t in tqdm(enumerate(scheduler.timesteps)):
    model_input = scheduler.scale_model_input(x, t)  # scale the input as the scheduler expects
    with torch.no_grad():
        noise_pred = image_pipe.unet(model_input, t)["sample"]  # predict the noise residual
    x = scheduler.step(noise_pred, t, sample=x).prev_sample  # take one denoising step
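
The loops in these notes assume image_pipe, scheduler and device were set up earlier. A minimal setup sketch, assuming the face-model checkpoint used in the course and a DDIM scheduler so sampling takes fewer steps (the checkpoint name and step count are examples, not fixed by these notes):

from diffusers import DDIMScheduler, DDPMPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
image_pipe = DDPMPipeline.from_pretrained("google/ddpm-celebahq-256").to(device)
scheduler = DDIMScheduler.from_config(image_pipe.scheduler.config)
scheduler.set_timesteps(num_inference_steps=40)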

Guidance

x = torch.randn(4, 3, 256, 256).to(device)
for i, t in tqdm(enumerate(scheduler.timesteps)):
    x = x.detach().requires_grad_()  # we need gradients with respect to x
    model_input = scheduler.scale_model_input(x, t)
    noise_pred = image_pipe.unet(model_input, t)["sample"]  # no torch.no_grad() here

    # Predicted denoised image x0, used to evaluate the guidance loss
    x0 = scheduler.step(noise_pred, t, x).pred_original_sample
    loss = custom_loss(x0) * guidance_loss_scale  # any differentiable loss on x0 (see the sketch below)
    cond_grad = -torch.autograd.grad(loss, x)[0]
    x = x.detach() + cond_grad  # nudge x in the direction that lowers the loss

    x = scheduler.step(noise_pred, t, x).prev_sample
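
A sketch of the kind of differentiable loss that could stand in for custom_loss, e.g. a colour loss that pulls the mean pixel values toward a target RGB colour (the target colour and guidance_loss_scale values are illustrative; x0 is assumed to be in [-1, 1]):

def color_loss(images, target_color=(0.1, 0.9, 0.5)):
    # Map the target color from [0, 1] to [-1, 1] to match the model's output range
    target = torch.tensor(target_color, device=images.device) * 2 - 1
    target = target[None, :, None, None]  # (1, 3, 1, 1), broadcasts over (B, 3, H, W)
    return torch.abs(images - target).mean()  # mean absolute error per pixel

custom_loss = color_loss
guidance_loss_scale = 40  # how strongly the guidance pulls the samples (example value)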

CLIP Guidance


# text: the tokenized guidance prompt; clip_model: a pre-loaded CLIP model
with torch.no_grad():
    text_features = clip_model.encode_text(text)  # embed the prompt once, up front

x = torch.randn(4, 3, 256, 256).to(device)  # start from random noise
for i, t in tqdm(enumerate(scheduler.timesteps)):
    # i counts up from 0 while t steps down through scheduler.timesteps
    model_input = scheduler.scale_model_input(x, t)  # scheduler is the DDIM scheduler loaded above
    with torch.no_grad():
        noise_pred = image_pipe.unet(model_input, t)["sample"]  # same image_pipe as above
    cond_grad = 0
    for cut in range(n_cuts):
        x = x.detach().requires_grad_()
        x0 = scheduler.step(noise_pred, t, sample=x).pred_original_sample
        loss = clip_loss(x0, text_features) * guidance_scale  # clip_loss: see the sketch below
        cond_grad -= torch.autograd.grad(loss, x)[0] / n_cuts  # average the gradient over the cuts

    if i % 25 == 0:
        print(f"Step {i}, guidance loss: {loss.item()}")

    # Scale the gradient by sqrt(alpha_bar). Indexed by i, alpha_bar decreases
    # over the loop; this works well for textures, but it could be swapped for
    # an increasing coefficient instead.
    alpha_bar = scheduler.alphas_cumprod[i]
    x = x.detach() + cond_grad * alpha_bar.sqrt()
    x = scheduler.step(noise_pred, t, x).prev_sample
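
A sketch of what clip_loss could look like, assuming clip_model is a CLIP model with encode_image / encode_text methods, random augmentations play the role of the cuts, and the loss is the squared great-circle distance between image and text embeddings. The transform choices and the guidance_scale / n_cuts values are illustrative, not fixed by these notes:

import torchvision

# Example settings for the loop above
guidance_scale = 8
n_cuts = 4

# Random crops act as the cuts; Normalize uses CLIP's preprocessing statistics
tfms = torchvision.transforms.Compose([
    torchvision.transforms.RandomResizedCrop(224),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.Normalize(
        mean=(0.48145466, 0.4578275, 0.40821073),
        std=(0.26862954, 0.26130258, 0.27577711),
    ),
])

def clip_loss(image, text_features):
    image = (image + 1) / 2  # map x0 from [-1, 1] to [0, 1] before CLIP preprocessing
    image_features = clip_model.encode_image(tfms(image))  # embed one augmented crop
    input_normed = torch.nn.functional.normalize(image_features.unsqueeze(1), dim=2)
    embed_normed = torch.nn.functional.normalize(text_features.unsqueeze(0), dim=2)
    # Squared great-circle distance between the normalized embeddings
    dists = input_normed.sub(embed_normed).norm(dim=2).div(2).arcsin().pow(2).mul(2)
    return dists.mean()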