Optimize small-tensor allocations inside the DDIM loop; cache the attention mask on the GPU

2026-02-08 14:20:48 +00:00
parent e588182642
commit 75c798ded0
3 changed files with 44 additions and 43 deletions
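Context for the change: in a DDIM sampler the denoising network, and with it this CrossAttention layer, runs once per step, so any tensor that _get_attn_mask_aa builds from scratch is reallocated (and, before this commit, copied host-to-device) at every step. A minimal sketch of that call pattern; the model and ddim_update stand-ins are invented for illustration:

import torch

def model(x, t):
    # Stand-in for the denoising network; in the real code this forward pass
    # reaches CrossAttention._get_attn_mask_aa once per invocation.
    return torch.randn_like(x)

def ddim_update(x, eps, t):
    # Stand-in for the deterministic DDIM update rule.
    return x - 0.01 * eps

num_steps = 50
x = torch.randn(1, 4, 32, 32)
for t in reversed(range(num_steps)):
    eps = model(x, t)          # 50 forward passes -> 50 mask constructions without a cache
    x = ddim_update(x, eps, t)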


@@ -275,7 +275,8 @@ class CrossAttention(nn.Module):
             attn_mask_aa = self._get_attn_mask_aa(x.shape[0],
                                                   q.shape[1],
                                                   k_aa.shape[1],
-                                                  block_size=16).to(k_aa.device)
+                                                  block_size=16,
+                                                  device=k_aa.device)
         else:
             if not spatial_self_attn:
                 assert 1 > 2, ">>> ERROR: you should never go into here ..."
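Call-site note: before this change the helper built the mask on the CPU and the result was copied with .to(k_aa.device) on every forward pass; passing device= lets the helper construct, or reuse, the tensor directly on the target device. A minimal sketch of the difference, with made-up shapes, assuming PyTorch:

import torch

b, l1, l2 = 4, 256, 256
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Before: allocate on the CPU, then copy to the device on every call.
mask_old = torch.zeros(b, l1, l2).to(device)

# After: allocate directly on the target device, no host-to-device copy.
mask_new = torch.zeros(b, l1, l2, device=device)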
@@ -386,14 +387,26 @@ class CrossAttention(nn.Module):
         return self.to_out(out)
 
-    def _get_attn_mask_aa(self, b, l1, l2, block_size=16):
+    def _get_attn_mask_aa(self, b, l1, l2, block_size=16, device=None):
+        cache_key = (b, l1, l2, block_size)
+        if hasattr(self, '_attn_mask_aa_cache_key') and self._attn_mask_aa_cache_key == cache_key:
+            cached = self._attn_mask_aa_cache
+            if device is not None and cached.device != torch.device(device):
+                cached = cached.to(device)
+                self._attn_mask_aa_cache = cached
+            return cached
+
+        target_device = device if device is not None else 'cpu'
         num_token = l2 // block_size
-        start_positions = ((torch.arange(b) % block_size) + 1) * num_token
-        col_indices = torch.arange(l2)
+        start_positions = ((torch.arange(b, device=target_device) % block_size) + 1) * num_token
+        col_indices = torch.arange(l2, device=target_device)
         mask_2d = col_indices.unsqueeze(0) >= start_positions.unsqueeze(1)
         mask = mask_2d.unsqueeze(1).expand(b, l1, l2)
-        attn_mask = torch.zeros_like(mask, dtype=torch.float)
+        attn_mask = torch.zeros(b, l1, l2, dtype=torch.float, device=target_device)
         attn_mask[mask] = float('-inf')
+        self._attn_mask_aa_cache_key = cache_key
+        self._attn_mask_aa_cache = attn_mask
         return attn_mask
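For reference, the caching behaviour above can be exercised in isolation. A minimal sketch, assuming PyTorch; the MaskCacheDemo class and the toy shapes are made up for illustration, but the cache key, device handling, and mask construction mirror the diff:

import torch
import torch.nn as nn

class MaskCacheDemo(nn.Module):
    """Illustrative stand-alone version of the cached additive attention mask."""

    def _get_attn_mask_aa(self, b, l1, l2, block_size=16, device=None):
        cache_key = (b, l1, l2, block_size)
        # Cache hit: reuse the stored mask, moving it only if the device changed.
        if getattr(self, '_attn_mask_aa_cache_key', None) == cache_key:
            cached = self._attn_mask_aa_cache
            if device is not None and cached.device != torch.device(device):
                cached = cached.to(device)
                self._attn_mask_aa_cache = cached
            return cached
        # Cache miss: build the mask directly on the requested device.
        target_device = device if device is not None else 'cpu'
        num_token = l2 // block_size
        start_positions = ((torch.arange(b, device=target_device) % block_size) + 1) * num_token
        col_indices = torch.arange(l2, device=target_device)
        mask_2d = col_indices.unsqueeze(0) >= start_positions.unsqueeze(1)
        mask = mask_2d.unsqueeze(1).expand(b, l1, l2)
        attn_mask = torch.zeros(b, l1, l2, dtype=torch.float, device=target_device)
        attn_mask[mask] = float('-inf')
        self._attn_mask_aa_cache_key = cache_key
        self._attn_mask_aa_cache = attn_mask
        return attn_mask

m = MaskCacheDemo()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
first = m._get_attn_mask_aa(4, 64, 64, block_size=16, device=device)
second = m._get_attn_mask_aa(4, 64, 64, block_size=16, device=device)
assert first.data_ptr() == second.data_ptr()  # second call reuses the cached tensor

Because the mask depends only on (b, l1, l2, block_size), a single cached tensor serves every DDIM step with the same shapes, so the per-step allocations and the host-to-device copy disappear.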