First full test case run completed
src/unifolm_wma/models/samplers/ddim.py: 473 lines added (new file)
@@ -0,0 +1,473 @@
import numpy as np
import torch
import copy

from unifolm_wma.utils.diffusion import make_ddim_sampling_parameters, make_ddim_timesteps, rescale_noise_cfg
from unifolm_wma.utils.common import noise_like
from unifolm_wma.utils.common import extract_into_tensor
from tqdm import tqdm


class DDIMSampler(object):
    """DDIM sampler that jointly denoises the video latent together with the
    agent action and state trajectories."""

    def __init__(self, model, schedule="linear", **kwargs):
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule
        self.counter = 0

    def register_buffer(self, name, attr):
        # Tensors are moved to CUDA unconditionally before being stored as attributes.
        if type(attr) == torch.Tensor:
            if attr.device != torch.device("cuda"):
                attr = attr.to(torch.device("cuda"))
        setattr(self, name, attr)

    def make_schedule(self,
                      ddim_num_steps,
                      ddim_discretize="uniform",
                      ddim_eta=0.,
                      verbose=True):
        self.ddim_timesteps = make_ddim_timesteps(
            ddim_discr_method=ddim_discretize,
            num_ddim_timesteps=ddim_num_steps,
            num_ddpm_timesteps=self.ddpm_num_timesteps,
            verbose=verbose)
        alphas_cumprod = self.model.alphas_cumprod
        assert alphas_cumprod.shape[
            0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model
                                                                     .device)

        if self.model.use_dynamic_rescale:
            self.ddim_scale_arr = self.model.scale_arr[self.ddim_timesteps]
            self.ddim_scale_arr_prev = torch.cat(
                [self.ddim_scale_arr[0:1], self.ddim_scale_arr[:-1]])

        self.register_buffer('betas', to_torch(self.model.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev',
                             to_torch(self.model.alphas_cumprod_prev))

        # Calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod',
                             to_torch(np.sqrt(alphas_cumprod.cpu())))
        self.register_buffer('sqrt_one_minus_alphas_cumprod',
                             to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
        self.register_buffer('log_one_minus_alphas_cumprod',
                             to_torch(np.log(1. - alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recip_alphas_cumprod',
                             to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recipm1_alphas_cumprod',
                             to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))

        # DDIM sampling parameters
        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(
            alphacums=alphas_cumprod.cpu(),
            ddim_timesteps=self.ddim_timesteps,
            eta=ddim_eta,
            verbose=verbose)
        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
        self.register_buffer('ddim_sqrt_one_minus_alphas',
                             np.sqrt(1. - ddim_alphas))
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) *
            (1 - self.alphas_cumprod / self.alphas_cumprod_prev))
        self.register_buffer('ddim_sigmas_for_original_num_steps',
                             sigmas_for_original_sampling_steps)

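    # Note on ddim_eta: the eta handed to make_ddim_sampling_parameters controls how
    # stochastic sampling is. With eta = 0 every per-step sigma is 0 and the DDIM
    # update in p_sample_ddim below is deterministic; larger eta re-injects noise at
    # each step (eta = 1 gives DDPM-like variance). The helper presumably applies the
    # same relation written out explicitly above for the original timesteps:
    #     sigma_t = eta * sqrt((1 - a_prev) / (1 - a_t) * (1 - a_t / a_prev))
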
    @torch.no_grad()
    def sample(
            self,
            S,
            batch_size,
            shape,
            conditioning=None,
            callback=None,
            normals_sequence=None,
            img_callback=None,
            quantize_x0=False,
            eta=0.,
            mask=None,
            x0=None,
            temperature=1.,
            noise_dropout=0.,
            score_corrector=None,
            corrector_kwargs=None,
            verbose=True,
            schedule_verbose=False,
            x_T=None,
            log_every_t=100,
            unconditional_guidance_scale=1.,
            unconditional_conditioning=None,
            precision=None,
            fs=None,
            timestep_spacing='uniform',  # uniform_trailing for starting from last timestep
            guidance_rescale=0.0,
            **kwargs):
        """Build the DDIM schedule for S steps and run the sampling loop.

        Returns (samples, actions, states, intermediates)."""
        # Check that the conditioning batch size matches the requested batch size
        if conditioning is not None:
            if isinstance(conditioning, dict):
                try:
                    cbs = conditioning[list(conditioning.keys())[0]].shape[0]
                except:
                    cbs = conditioning[list(
                        conditioning.keys())[0]][0].shape[0]

                if cbs != batch_size:
                    print(
                        f"Warning: Got {cbs} conditionings but batch-size is {batch_size}"
                    )
            else:
                if conditioning.shape[0] != batch_size:
                    print(
                        f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}"
                    )

        self.make_schedule(ddim_num_steps=S,
                           ddim_discretize=timestep_spacing,
                           ddim_eta=eta,
                           verbose=schedule_verbose)

        # Make shape
        if len(shape) == 3:
            C, H, W = shape
            size = (batch_size, C, H, W)
        elif len(shape) == 4:
            C, T, H, W = shape
            size = (batch_size, C, T, H, W)

        samples, actions, states, intermediates = self.ddim_sampling(
            conditioning,
            size,
            callback=callback,
            img_callback=img_callback,
            quantize_denoised=quantize_x0,
            mask=mask,
            x0=x0,
            ddim_use_original_steps=False,
            noise_dropout=noise_dropout,
            temperature=temperature,
            score_corrector=score_corrector,
            corrector_kwargs=corrector_kwargs,
            x_T=x_T,
            log_every_t=log_every_t,
            unconditional_guidance_scale=unconditional_guidance_scale,
            unconditional_conditioning=unconditional_conditioning,
            verbose=verbose,
            precision=precision,
            fs=fs,
            guidance_rescale=guidance_rescale,
            **kwargs)
        return samples, actions, states, intermediates

    @torch.no_grad()
    def ddim_sampling(self,
                      cond,
                      shape,
                      x_T=None,
                      ddim_use_original_steps=False,
                      callback=None,
                      timesteps=None,
                      quantize_denoised=False,
                      mask=None,
                      x0=None,
                      img_callback=None,
                      log_every_t=100,
                      temperature=1.,
                      noise_dropout=0.,
                      score_corrector=None,
                      corrector_kwargs=None,
                      unconditional_guidance_scale=1.,
                      unconditional_conditioning=None,
                      verbose=True,
                      precision=None,
                      fs=None,
                      guidance_rescale=0.0,
                      **kwargs):
        """Iterative DDIM loop: denoise the video latent with p_sample_ddim and step
        the action/state latents with the diffusion-policy DDIM schedulers."""
        device = self.model.betas.device
        dp_ddim_scheduler_action = self.model.dp_noise_scheduler_action
        dp_ddim_scheduler_state = self.model.dp_noise_scheduler_state

        b = shape[0]
        if x_T is None:
            img = torch.randn(shape, device=device)
            action = torch.randn((b, 16, self.model.agent_action_dim),
                                 device=device)
            state = torch.randn((b, 16, self.model.agent_state_dim),
                                device=device)
        else:
            img = x_T
            action = torch.randn((b, 16, self.model.agent_action_dim),
                                 device=device)
            state = torch.randn((b, 16, self.model.agent_state_dim),
                                device=device)

        if precision is not None:
            if precision == 16:
                img = img.to(dtype=torch.float16)
                action = action.to(dtype=torch.float16)
                state = state.to(dtype=torch.float16)

        if timesteps is None:
            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
        elif timesteps is not None and not ddim_use_original_steps:
            subset_end = int(
                min(timesteps / self.ddim_timesteps.shape[0], 1) *
                self.ddim_timesteps.shape[0]) - 1
            timesteps = self.ddim_timesteps[:subset_end]

        intermediates = {
            'x_inter': [img],
            'pred_x0': [img],
            'x_inter_action': [action],
            'pred_x0_action': [action],
            'x_inter_state': [state],
            'pred_x0_state': [state],
        }
        time_range = reversed(range(
            0, timesteps)) if ddim_use_original_steps else np.flip(timesteps)
        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[
            0]
        if verbose:
            iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
        else:
            iterator = time_range

        clean_cond = kwargs.pop("clean_cond", False)

        dp_ddim_scheduler_action.set_timesteps(len(timesteps))
        dp_ddim_scheduler_state.set_timesteps(len(timesteps))
        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((b, ), step, device=device, dtype=torch.long)

            # Use mask to blend noised original latent (img_orig) & new sampled latent (img)
            if mask is not None:
                assert x0 is not None
                if clean_cond:
                    img_orig = x0
                else:
                    img_orig = self.model.q_sample(x0, ts)
                img = img_orig * mask + (1. - mask) * img

            outs = self.p_sample_ddim(
                img,
                action,
                state,
                cond,
                ts,
                index=index,
                use_original_steps=ddim_use_original_steps,
                quantize_denoised=quantize_denoised,
                temperature=temperature,
                noise_dropout=noise_dropout,
                score_corrector=score_corrector,
                corrector_kwargs=corrector_kwargs,
                unconditional_guidance_scale=unconditional_guidance_scale,
                unconditional_conditioning=unconditional_conditioning,
                mask=mask,
                x0=x0,
                fs=fs,
                guidance_rescale=guidance_rescale,
                **kwargs)

            img, pred_x0, model_output_action, model_output_state = outs

            # Step the action and state latents with their diffusion-policy schedulers
            action = dp_ddim_scheduler_action.step(
                model_output_action,
                step,
                action,
                generator=None,
            ).prev_sample
            state = dp_ddim_scheduler_state.step(
                model_output_state,
                step,
                state,
                generator=None,
            ).prev_sample

            if callback: callback(i)
            if img_callback: img_callback(pred_x0, i)

            if index % log_every_t == 0 or index == total_steps - 1:
                intermediates['x_inter'].append(img)
                intermediates['pred_x0'].append(pred_x0)
                intermediates['x_inter_action'].append(action)
                intermediates['x_inter_state'].append(state)

        return img, action, state, intermediates

    @torch.no_grad()
    def p_sample_ddim(self,
                      x,
                      x_action,
                      x_state,
                      c,
                      t,
                      index,
                      repeat_noise=False,
                      use_original_steps=False,
                      quantize_denoised=False,
                      temperature=1.,
                      noise_dropout=0.,
                      score_corrector=None,
                      corrector_kwargs=None,
                      unconditional_guidance_scale=1.,
                      unconditional_conditioning=None,
                      uc_type=None,
                      conditional_guidance_scale_temporal=None,
                      mask=None,
                      x0=None,
                      guidance_rescale=0.0,
                      **kwargs):
        """Single DDIM denoising step with optional classifier-free guidance.

        Returns (x_prev, pred_x0, model_output_action, model_output_state)."""
        b, *_, device = *x.shape, x.device
        if x.dim() == 5:
            is_video = True
        else:
            is_video = False

        if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
            model_output, model_output_action, model_output_state = self.model.apply_model(
                x, x_action, x_state, t, c, **kwargs)  # unet denoiser
        else:
            # do_classifier_free_guidance
            if isinstance(c, torch.Tensor) or isinstance(c, dict):
                e_t_cond, e_t_cond_action, e_t_cond_state = self.model.apply_model(
                    x, x_action, x_state, t, c, **kwargs)
                e_t_uncond, e_t_uncond_action, e_t_uncond_state = self.model.apply_model(
                    x, x_action, x_state, t, unconditional_conditioning,
                    **kwargs)
            else:
                raise NotImplementedError
            model_output = e_t_uncond + unconditional_guidance_scale * (
                e_t_cond - e_t_uncond)
            model_output_action = e_t_uncond_action + unconditional_guidance_scale * (
                e_t_cond_action - e_t_uncond_action)
            model_output_state = e_t_uncond_state + unconditional_guidance_scale * (
                e_t_cond_state - e_t_uncond_state)

            if guidance_rescale > 0.0:
                model_output = rescale_noise_cfg(
                    model_output, e_t_cond, guidance_rescale=guidance_rescale)
                model_output_action = rescale_noise_cfg(
                    model_output_action,
                    e_t_cond_action,
                    guidance_rescale=guidance_rescale)
                model_output_state = rescale_noise_cfg(
                    model_output_state,
                    e_t_cond_state,
                    guidance_rescale=guidance_rescale)

        if self.model.parameterization == "v":
            e_t = self.model.predict_eps_from_z_and_v(x, t, model_output)
        else:
            e_t = model_output

        if score_corrector is not None:
            assert self.model.parameterization == "eps", 'not implemented'
            e_t = score_corrector.modify_score(self.model, e_t, x, t, c,
                                               **corrector_kwargs)

        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
        sigmas = self.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas

        if is_video:
            size = (b, 1, 1, 1, 1)
        else:
            size = (b, 1, 1, 1)

        a_t = torch.full(size, alphas[index], device=device)
        a_prev = torch.full(size, alphas_prev[index], device=device)
        sigma_t = torch.full(size, sigmas[index], device=device)
        sqrt_one_minus_at = torch.full(size,
                                       sqrt_one_minus_alphas[index],
                                       device=device)

        if self.model.parameterization != "v":
            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
        else:
            pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output)

        if self.model.use_dynamic_rescale:
            scale_t = torch.full(size,
                                 self.ddim_scale_arr[index],
                                 device=device)
            prev_scale_t = torch.full(size,
                                      self.ddim_scale_arr_prev[index],
                                      device=device)
            rescale = (prev_scale_t / scale_t)
            pred_x0 *= rescale

        if quantize_denoised:
            pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)

        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t

        noise = sigma_t * noise_like(x.shape, device,
                                     repeat_noise) * temperature
        if noise_dropout > 0.:
            noise = torch.nn.functional.dropout(noise, p=noise_dropout)

        x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise

        return x_prev, pred_x0, model_output_action, model_output_state

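    # For reference, the update at the end of p_sample_ddim is the standard DDIM step
    # (Song et al., "Denoising Diffusion Implicit Models"):
    #     x_prev = sqrt(a_prev) * pred_x0                       (signal)
    #              + sqrt(1 - a_prev - sigma_t**2) * e_t        (dir_xt)
    #              + sigma_t * noise                            (stochastic part, 0 when eta = 0)
    # where pred_x0 = (x - sqrt(1 - a_t) * e_t) / sqrt(a_t) under the eps parameterization,
    # or comes from predict_start_from_z_and_v under the v parameterization.
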
    @torch.no_grad()
    def decode(self,
               x_latent,
               cond,
               t_start,
               unconditional_guidance_scale=1.0,
               unconditional_conditioning=None,
               use_original_steps=False,
               callback=None):
        # NOTE: this legacy helper still calls p_sample_ddim with the old
        # (x, cond, ts) signature and unpacks two return values, while
        # p_sample_ddim above now expects action/state latents and returns four.
        timesteps = np.arange(self.ddpm_num_timesteps
                              ) if use_original_steps else self.ddim_timesteps
        timesteps = timesteps[:t_start]

        time_range = np.flip(timesteps)
        total_steps = timesteps.shape[0]
        print(f"Running DDIM Sampling with {total_steps} timesteps")

        iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
        x_dec = x_latent
        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((x_latent.shape[0], ),
                            step,
                            device=x_latent.device,
                            dtype=torch.long)
            x_dec, _ = self.p_sample_ddim(
                x_dec,
                cond,
                ts,
                index=index,
                use_original_steps=use_original_steps,
                unconditional_guidance_scale=unconditional_guidance_scale,
                unconditional_conditioning=unconditional_conditioning)
            if callback: callback(i)
        return x_dec

    @torch.no_grad()
    def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
        # fast, but does not allow for exact reconstruction
        if use_original_steps:
            sqrt_alphas_cumprod = self.sqrt_alphas_cumprod
            sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod
        else:
            sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
            sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas

        if noise is None:
            noise = torch.randn_like(x0)
        return (
            extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
            extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) *
            noise)
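
As a quick sanity check of the update rule implemented in p_sample_ddim, the standalone sketch below runs the same deterministic DDIM step (eta = 0, so sigma_t = 0) on a toy 1-D Gaussian diffusion. It is illustrative only and not part of this commit: the schedule values, the uniform timestep spacing, and the toy_eps noise predictor are assumptions chosen so the example runs on its own, independent of the unifolm_wma model.

import numpy as np

T = 1000                                    # full DDPM schedule length (assumed)
betas = np.linspace(1e-4, 2e-2, T)
alphas_cumprod = np.cumprod(1.0 - betas)

ddim_steps = 50
ddim_timesteps = np.linspace(0, T - 1, ddim_steps, dtype=int)   # uniform spacing


def toy_eps(x_t, t):
    # For data drawn from N(0, 1), the optimal noise prediction is
    # E[eps | x_t] = sqrt(1 - alpha_bar_t) * x_t; this stands in for the UNet.
    return np.sqrt(1.0 - alphas_cumprod[t]) * x_t


x = np.random.randn(4)                      # start from pure noise, like x_T
rev = ddim_timesteps[::-1]
for i, t in enumerate(rev):
    a_t = alphas_cumprod[t]
    # alpha_bar of the next (earlier) DDIM timestep; the final step falls back to
    # alphas_cumprod[0], mirroring the usual DDIM convention.
    a_prev = alphas_cumprod[rev[i + 1]] if i + 1 < ddim_steps else alphas_cumprod[0]
    e_t = toy_eps(x, t)
    pred_x0 = (x - np.sqrt(1.0 - a_t) * e_t) / np.sqrt(a_t)     # predicted clean sample
    dir_xt = np.sqrt(1.0 - a_prev) * e_t                        # direction term, sigma_t = 0
    x = np.sqrt(a_prev) * pred_x0 + dir_xt                      # deterministic DDIM update

print("final sample:", x)                   # should look roughly like draws from N(0, 1)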