速度变化不明显，PSNR 显著提升

2026-02-11 16:38:21 +08:00
parent f386a5810b
commit 3101252c25
4 changed files with 58 additions and 37 deletions
--- a/src/unifolm_wma/models/ddpms.py
+++ b/src/unifolm_wma/models/ddpms.py
@@ -988,7 +988,7 @@ class LatentDiffusion(DDPM):

    def instantiate_cond_stage(self, config: OmegaConf) -> None:
        """
-        Build the conditioning stage model.
+        Build the conditioning stage model. Frozen models are converted to FP16.

        Args:
            config: OmegaConf config describing the conditioning model to instantiate.
@@ -1000,6 +1000,7 @@ class LatentDiffusion(DDPM):
            self.cond_stage_model.train = disabled_train
            for param in self.cond_stage_model.parameters():
                param.requires_grad = False
+            self.cond_stage_model.half()
        else:
            model = instantiate_from_config(config)
            self.cond_stage_model = model
@@ -1014,17 +1015,18 @@ class LatentDiffusion(DDPM):
        Returns:
            Conditioning embedding as a tensor (shape depends on cond model).
        """
-        if self.cond_stage_forward is None:
-            if hasattr(self.cond_stage_model, 'encode') and callable(
-                    self.cond_stage_model.encode):
-                c = self.cond_stage_model.encode(c)
-                if isinstance(c, DiagonalGaussianDistribution):
-                    c = c.mode()
+        with torch.cuda.amp.autocast(dtype=torch.float16):
+            if self.cond_stage_forward is None:
+                if hasattr(self.cond_stage_model, 'encode') and callable(
+                        self.cond_stage_model.encode):
+                    c = self.cond_stage_model.encode(c)
+                    if isinstance(c, DiagonalGaussianDistribution):
+                        c = c.mode()
+                else:
+                    c = self.cond_stage_model(c)
            else:
-                c = self.cond_stage_model(c)
-        else:
-            assert hasattr(self.cond_stage_model, self.cond_stage_forward)
-            c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
+                assert hasattr(self.cond_stage_model, self.cond_stage_forward)
+                c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
        return c

    def get_first_stage_encoding(
@@ -1957,6 +1959,7 @@ class LatentVisualDiffusion(LatentDiffusion):
            self.image_proj_model.train = disabled_train
            for param in self.image_proj_model.parameters():
                param.requires_grad = False
+            self.image_proj_model.half()

    def _init_embedder(self, config: OmegaConf, freeze: bool = True) -> None:
        """
@@ -1972,6 +1975,7 @@ class LatentVisualDiffusion(LatentDiffusion):
            self.embedder.train = disabled_train
            for param in self.embedder.parameters():
                param.requires_grad = False
+            self.embedder.half()

    def init_normalizers(self, normalize_config: OmegaConf,
                         dataset_stats: Mapping[str, Any]) -> None:
@@ -2175,8 +2179,9 @@ class LatentVisualDiffusion(LatentDiffusion):
            (random_num < 3 * self.uncond_prob).float(), "n -> n 1 1 1")

        cond_img = input_mask * img
-        cond_img_emb = self.embedder(cond_img)
-        cond_img_emb = self.image_proj_model(cond_img_emb)
+        with torch.cuda.amp.autocast(dtype=torch.float16):
+            cond_img_emb = self.embedder(cond_img)
+            cond_img_emb = self.image_proj_model(cond_img_emb)

        if self.model.conditioning_key == 'hybrid':
            if self.interp_mode:
@@ -2191,11 +2196,12 @@ class LatentVisualDiffusion(LatentDiffusion):
                                      repeat=z.shape[2])
            cond["c_concat"] = [img_cat_cond]

-        cond_action = self.action_projector(action)
-        cond_action_emb = self.agent_action_pos_emb + cond_action
-        # Get conditioning states
-        cond_state = self.state_projector(obs_state)
-        cond_state_emb = self.agent_state_pos_emb + cond_state
+        with torch.cuda.amp.autocast(dtype=torch.float16):
+            cond_action = self.action_projector(action)
+            cond_action_emb = self.agent_action_pos_emb + cond_action
+            # Get conditioning states
+            cond_state = self.state_projector(obs_state)
+            cond_state_emb = self.agent_state_pos_emb + cond_state

        if self.decision_making_only:
            is_sim_mode = False