KV fusion implemented. Change summary: marginal speedup; PSNR up slightly.

attention.py — 3 changes:
  1. __init__: add a _kv_fused = False flag
  2. New fuse_kv() method: fuses to_k + to_v → to_kv, and handles the _ip/_as/_aa auxiliary KV pairs as well
  3. bmm_forward: both branches check _kv_fused, replacing the separate to_k/to_v calls with to_kv().chunk(2, dim=-1) (see the equivalence sketch after this list)
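
Why fusion is numerically safe: stacking the K and V weight matrices along the
output dimension turns two GEMMs into one, and chunk(2, dim=-1) on the result
recovers k and v unchanged. A minimal standalone sketch (toy dimensions and
variable names are mine, not the repo's):

    import torch
    import torch.nn as nn

    torch.manual_seed(0)
    inner_dim, context_dim = 320, 1024   # toy sizes, not the model's
    to_k = nn.Linear(context_dim, inner_dim, bias=False)
    to_v = nn.Linear(context_dim, inner_dim, bias=False)

    # Rows [0, inner_dim) of the stacked weight are K's, the rest are V's,
    # so y = x @ W_fused.T splits back into (k, v) along the last dim.
    to_kv = nn.Linear(context_dim, inner_dim * 2, bias=False)
    to_kv.weight = nn.Parameter(torch.cat([to_k.weight, to_v.weight], dim=0))

    x = torch.randn(2, 77, context_dim)
    k, v = to_kv(x).chunk(2, dim=-1)
    assert torch.allclose(k, to_k(x), atol=1e-6)
    assert torch.allclose(v, to_v(x), atol=1e-6)
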
Date:   2026-02-10 18:07:23 +00:00
parent  2cef3e9e45
commit  57ba85d147
4 changed files with 61 additions and 23 deletions

attention.py

@@ -99,6 +99,7 @@ class CrossAttention(nn.Module):
         self.agent_action_context_len = agent_action_context_len
         self._kv_cache = {}
         self._kv_cache_enabled = False
+        self._kv_fused = False
         self.cross_attention_scale_learnable = cross_attention_scale_learnable
         if self.image_cross_attention:
@@ -116,6 +117,27 @@ class CrossAttention(nn.Module):
             self.register_parameter('alpha_caa',
                                     nn.Parameter(torch.tensor(0.)))
 
+    def fuse_kv(self):
+        """Fuse to_k/to_v into to_kv (2 Linear → 1). Works for all layers."""
+        k_w = self.to_k.weight  # (inner_dim, context_dim)
+        v_w = self.to_v.weight
+        self.to_kv = nn.Linear(k_w.shape[1], k_w.shape[0] * 2, bias=False)
+        self.to_kv.weight = nn.Parameter(torch.cat([k_w, v_w], dim=0))
+        del self.to_k, self.to_v
+        if self.image_cross_attention:
+            for suffix in ('_ip', '_as', '_aa'):
+                k_attr = f'to_k{suffix}'
+                v_attr = f'to_v{suffix}'
+                kw = getattr(self, k_attr).weight
+                vw = getattr(self, v_attr).weight
+                fused = nn.Linear(kw.shape[1], kw.shape[0] * 2, bias=False)
+                fused.weight = nn.Parameter(torch.cat([kw, vw], dim=0))
+                setattr(self, f'to_kv{suffix}', fused)
+                delattr(self, k_attr)
+                delattr(self, v_attr)
+        self._kv_fused = True
+        return True
+
     def forward(self, x, context=None, mask=None):
         spatial_self_attn = (context is None)
         k_ip, v_ip, out_ip = None, None, None
@@ -276,14 +298,20 @@ class CrossAttention(nn.Module):
                                               self.agent_action_context_len +
                                               self.text_context_len:, :]
-                k = self.to_k(context_ins)
-                v = self.to_v(context_ins)
-                k_ip = self.to_k_ip(context_image)
-                v_ip = self.to_v_ip(context_image)
-                k_as = self.to_k_as(context_agent_state)
-                v_as = self.to_v_as(context_agent_state)
-                k_aa = self.to_k_aa(context_agent_action)
-                v_aa = self.to_v_aa(context_agent_action)
+                if self._kv_fused:
+                    k, v = self.to_kv(context_ins).chunk(2, dim=-1)
+                    k_ip, v_ip = self.to_kv_ip(context_image).chunk(2, dim=-1)
+                    k_as, v_as = self.to_kv_as(context_agent_state).chunk(2, dim=-1)
+                    k_aa, v_aa = self.to_kv_aa(context_agent_action).chunk(2, dim=-1)
+                else:
+                    k = self.to_k(context_ins)
+                    v = self.to_v(context_ins)
+                    k_ip = self.to_k_ip(context_image)
+                    v_ip = self.to_v_ip(context_image)
+                    k_as = self.to_k_as(context_agent_state)
+                    v_as = self.to_v_as(context_agent_state)
+                    k_aa = self.to_k_aa(context_agent_action)
+                    v_aa = self.to_v_aa(context_agent_action)
 
                 q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h),
                               (q, k, v))
@@ -304,8 +332,11 @@ class CrossAttention(nn.Module):
         else:
             if not spatial_self_attn:
                 context = context[:, :self.text_context_len, :]
-            k = self.to_k(context)
-            v = self.to_v(context)
+            if self._kv_fused:
+                k, v = self.to_kv(context).chunk(2, dim=-1)
+            else:
+                k = self.to_k(context)
+                v = self.to_v(context)
             q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h),
                           (q, k, v))
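
For an end-to-end sanity check, something along these lines should work (the
module walker and tolerance below are my assumptions, not part of this commit;
it assumes every CrossAttention exposes fuse_kv() as above):

    import torch

    def fuse_all_kv(model):
        """Call fuse_kv() on every layer that has one; return the count."""
        n = 0
        for m in model.modules():
            if hasattr(m, 'fuse_kv') and not getattr(m, '_kv_fused', False):
                m.fuse_kv()
                n += 1
        return n

    # Usage sketch (model, x, context are placeholders):
    # with torch.no_grad():
    #     ref = model(x, context)
    #     fuse_all_kv(model)
    #     out = model(x, context)
    #     assert torch.allclose(ref, out, atol=1e-5)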