- state_unet 放到一个独立的 CUDA stream 上执行

- action_unet 在默认 stream 上同时执行
  - 用 wait_stream 确保两者都完成后再返回
两个 1D UNet 输入完全独立,共享的 hs_a 和 context_action 都是只读的。GPU 利用率只有 ~31%,小张量 kernel 不会打满 GPU,两个 stream 可以真正并行。
This commit is contained in:
qhy
2026-02-10 21:41:48 +08:00
parent ff43432ef9
commit dcbcb2c377
4 changed files with 28 additions and 19 deletions

View File

@@ -688,6 +688,8 @@ class WMAModel(nn.Module):
# Context precomputation cache
self._ctx_cache_enabled = False
self._ctx_cache = {}
# Reusable CUDA stream for parallel state_unet / action_unet
self._state_stream = torch.cuda.Stream()
def forward(self,
x: Tensor,
@@ -842,15 +844,16 @@ class WMAModel(nn.Module):
if not self.base_model_gen_only:
ba, _, _ = x_action.shape
ts_state = timesteps[:ba] if b > 1 else timesteps
# Run action_unet and state_unet in parallel via CUDA streams
s_stream = self._state_stream
s_stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s_stream):
s_y = self.state_unet(x_state, ts_state, hs_a,
context_action[:2], **kwargs)
a_y = self.action_unet(x_action, timesteps[:ba], hs_a,
context_action[:2], **kwargs)
# Predict state
if b > 1:
s_y = self.state_unet(x_state, timesteps[:ba], hs_a,
context_action[:2], **kwargs)
else:
s_y = self.state_unet(x_state, timesteps, hs_a,
context_action[:2], **kwargs)
torch.cuda.current_stream().wait_stream(s_stream)
else:
a_y = torch.zeros_like(x_action)
s_y = torch.zeros_like(x_state)