Successful attempt: run the video UNet backbone through TensorRT

qhy
2026-02-18 19:14:55 +08:00
parent 9a08e27a19
commit 65788be1b3
32 changed files with 1772 additions and 50 deletions


@@ -688,6 +688,7 @@ class WMAModel(nn.Module):
         # Context precomputation cache
         self._ctx_cache_enabled = False
         self._ctx_cache = {}
+        self._trt_backbone = None  # TRT engine for video UNet backbone
         # Reusable CUDA stream for parallel state_unet / action_unet
         self._state_stream = torch.cuda.Stream()
@@ -700,6 +701,12 @@ class WMAModel(nn.Module):
         self.__dict__.update(state)
         self._state_stream = torch.cuda.Stream()
 
+    def load_trt_backbone(self, engine_path, n_hs_a=9):
+        """Load a TensorRT engine for the video UNet backbone."""
+        from unifolm_wma.trt_utils import TRTBackbone
+        self._trt_backbone = TRTBackbone(engine_path, n_hs_a=n_hs_a)
+        print(f">>> TRT backbone loaded from {engine_path}")
+
     def forward(self,
                 x: Tensor,
                 x_action: Tensor,
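The commit adds `unifolm_wma/trt_utils.py` among the 32 changed files, but that file's contents are not shown on this page. As a reference point, here is a minimal sketch of what a `TRTBackbone` wrapper with this call contract could look like. The I/O tensor ordering (`h`, `emb`, `context` first, then `y` followed by `n_hs_a` skip features), static engine shapes, output dtype, and the TensorRT >= 8.6 tensor-address API are all assumptions, not the commit's actual implementation:

```python
import tensorrt as trt
import torch

class TRTBackbone:
    """Sketch: wrap a serialized TensorRT engine for the video UNet backbone.

    Assumes a static-shape engine whose I/O tensors are ordered
    (h, emb, context) followed by 1 + n_hs_a outputs: y first, then
    the skip features returned as hs_a.
    """

    def __init__(self, engine_path, n_hs_a=9):
        logger = trt.Logger(trt.Logger.WARNING)
        with open(engine_path, "rb") as f, trt.Runtime(logger) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()
        self.n_hs_a = n_hs_a
        self.io_names = [self.engine.get_tensor_name(i)
                         for i in range(self.engine.num_io_tensors)]

    def __call__(self, h, emb, ctx):
        inputs = iter((h, emb, ctx))
        outputs = []
        for name in self.io_names:
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                # Bind input buffers directly from the (contiguous) torch tensors.
                self.context.set_tensor_address(name, next(inputs).data_ptr())
            else:
                # Assumption: outputs share the input dtype and device.
                shape = tuple(self.context.get_tensor_shape(name))
                out = torch.empty(shape, dtype=h.dtype, device=h.device)
                self.context.set_tensor_address(name, out.data_ptr())
                outputs.append(out)
        # Enqueue on PyTorch's current CUDA stream (TensorRT >= 8.6 API).
        self.context.execute_async_v3(torch.cuda.current_stream().cuda_stream)
        return outputs[0], outputs[1:1 + self.n_hs_a]
```

Returning the `(y, hs_a)` pair keeps the wrapper drop-in compatible with the call site in `forward()` below, where the PyTorch fallback path produces the same pair.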
@@ -812,44 +819,50 @@ class WMAModel(nn.Module):
         fs_embed = fs_embed.repeat_interleave(repeats=t, dim=0)
         emb = emb + fs_embed
 
-        h = x.type(self.dtype)
-        adapter_idx = 0
-        hs = []
-        hs_a = []
-        for id, module in enumerate(self.input_blocks):
-            h = module(h, emb, context=context, batch_size=b)
-            if id == 0 and self.addition_attention:
-                h = self.init_attn(h, emb, context=context, batch_size=b)
-            # plug-in adapter features
-            if ((id + 1) % 3 == 0) and features_adapter is not None:
-                h = h + features_adapter[adapter_idx]
-                adapter_idx += 1
-            if id != 0:
-                if isinstance(module[0], Downsample):
-                    hs_a.append(
-                        rearrange(hs[-1], '(b t) c h w -> b t c h w', t=t))
-            hs.append(h)
-        hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
-        if features_adapter is not None:
-            assert len(
-                features_adapter) == adapter_idx, 'Wrong features_adapter'
-        h = self.middle_block(h, emb, context=context, batch_size=b)
-        hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
-        hs_out = []
-        for module in self.output_blocks:
-            h = torch.cat([h, hs.pop()], dim=1)
-            h = module(h, emb, context=context, batch_size=b)
-            if isinstance(module[-1], Upsample):
-                hs_a.append(
-                    rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
-            hs_out.append(h)
-        h = h.type(x.dtype)
-        hs_a.append(rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
-        y = self.out(h)
-        y = rearrange(y, '(b t) c h w -> b c t h w', b=b)
+        if self._trt_backbone is not None:
+            # TRT path: run backbone via TensorRT engine
+            h_in = x.type(self.dtype).contiguous()
+            y, hs_a = self._trt_backbone(h_in, emb.contiguous(), context.contiguous())
+        else:
+            # PyTorch path: original backbone
+            h = x.type(self.dtype)
+            adapter_idx = 0
+            hs = []
+            hs_a = []
+            for id, module in enumerate(self.input_blocks):
+                h = module(h, emb, context=context, batch_size=b)
+                if id == 0 and self.addition_attention:
+                    h = self.init_attn(h, emb, context=context, batch_size=b)
+                # plug-in adapter features
+                if ((id + 1) % 3 == 0) and features_adapter is not None:
+                    h = h + features_adapter[adapter_idx]
+                    adapter_idx += 1
+                if id != 0:
+                    if isinstance(module[0], Downsample):
+                        hs_a.append(
+                            rearrange(hs[-1], '(b t) c h w -> b t c h w', t=t))
+                hs.append(h)
+            hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
+            if features_adapter is not None:
+                assert len(
+                    features_adapter) == adapter_idx, 'Wrong features_adapter'
+            h = self.middle_block(h, emb, context=context, batch_size=b)
+            hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
+            hs_out = []
+            for module in self.output_blocks:
+                h = torch.cat([h, hs.pop()], dim=1)
+                h = module(h, emb, context=context, batch_size=b)
+                if isinstance(module[-1], Upsample):
+                    hs_a.append(
+                        rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
+                hs_out.append(h)
+            h = h.type(x.dtype)
+            hs_a.append(rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
+            y = self.out(h)
+            y = rearrange(y, '(b t) c h w -> b c t h w', b=b)
 
         if not self.base_model_gen_only:
             ba, _, _ = x_action.shape
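For completeness: the `.engine` file consumed by `load_trt_backbone()` would typically be built offline from an ONNX export of the backbone. The commit's actual build procedure lives in the other changed files not shown on this page, so the sketch below is only a standard TensorRT builder flow under assumed file names and an assumed FP16 precision flag:

```python
# Sketch: build the serialized engine that load_trt_backbone() consumes.
# "video_unet.onnx" and "video_unet_fp16.engine" are placeholder names.
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open("video_unet.onnx", "rb") as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("ONNX parse failed")

config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)  # assumption: FP16 backbone

engine_bytes = builder.build_serialized_network(network, config)
with open("video_unet_fp16.engine", "wb") as f:
    f.write(engine_bytes)
```

Once built, the engine is opted into at inference time with `model.load_trt_backbone("video_unet_fp16.engine", n_hs_a=9)`; `forward()` then routes the video UNet through TensorRT while `state_unet` / `action_unet` keep the original PyTorch path.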