diff --git a/scripts/evaluation/profile_unet.py b/scripts/evaluation/profile_unet.py
index 1860f27..a58085e 100644
--- a/scripts/evaluation/profile_unet.py
+++ b/scripts/evaluation/profile_unet.py
@@ -50,6 +50,20 @@ PEAK_BF16_TFLOPS = 61.0
 PEAK_FP32_TFLOPS = 30.5
 
 
+def apply_torch_compile(model, hot_indices=(5, 8, 9)):
+    """Compile ResBlock._forward in the hottest output_blocks for operator fusion."""
+    from unifolm_wma.modules.networks.wma_model import ResBlock
+    unet = model.model.diffusion_model
+    compiled = 0
+    for idx in hot_indices:
+        block = unet.output_blocks[idx]
+        for layer in block:
+            if isinstance(layer, ResBlock):
+                layer._forward = torch.compile(layer._forward, mode="default")
+                compiled += 1
+    print(f"    ✓ torch.compile: {compiled} ResBlocks in output_blocks{list(hot_indices)}")
+
+
 def load_model(args):
     config = OmegaConf.load(args.config)
     config['model']['params']['wma_config']['params']['use_checkpoint'] = False
@@ -62,6 +76,7 @@ def load_model(args):
 
     model.eval()
     model.model.to(torch.bfloat16)
+    apply_torch_compile(model)
     model = model.cuda()
     return model
 
diff --git a/scripts/evaluation/world_model_interaction.py b/scripts/evaluation/world_model_interaction.py
index 3066028..60d7e8f 100644
--- a/scripts/evaluation/world_model_interaction.py
+++ b/scripts/evaluation/world_model_interaction.py
@@ -135,6 +135,21 @@ def apply_precision_settings(model: nn.Module, args: argparse.Namespace) -> nn.M
     return model
 
 
+def apply_torch_compile(model, hot_indices=(5, 8, 9)):
+    """Compile ResBlock._forward in the hottest output_blocks for operator fusion."""
+    from unifolm_wma.modules.networks.wma_model import ResBlock
+    unet = model.model.diffusion_model
+    compiled = 0
+    for idx in hot_indices:
+        block = unet.output_blocks[idx]
+        for layer in block:
+            if isinstance(layer, ResBlock):
+                layer._forward = torch.compile(layer._forward, mode="default")
+                compiled += 1
+    print(f"    ✓ torch.compile: {compiled} ResBlocks in output_blocks{list(hot_indices)}")
+    return model
+
+
 def write_video(video_path: str, stacked_frames: list, fps: int) -> None:
     """Save a list of frames to a video file.
 
@@ -601,6 +616,9 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
     # Apply precision settings before moving to GPU
     model = apply_precision_settings(model, args)
 
+    # Compile hot ResBlocks for operator fusion
+    apply_torch_compile(model)
+
     # Export precision-converted checkpoint if requested
     if args.export_precision_ckpt:
         export_path = args.export_precision_ckpt
diff --git a/src/unifolm_wma/models/samplers/ddim.py b/src/unifolm_wma/models/samplers/ddim.py
index e40e055..c9ade02 100644
--- a/src/unifolm_wma/models/samplers/ddim.py
+++ b/src/unifolm_wma/models/samplers/ddim.py
@@ -209,9 +209,9 @@ class DDIMSampler(object):
 
         if precision is not None:
             if precision == 16:
-                img = img.to(dtype=torch.float16)
-                action = action.to(dtype=torch.float16)
-                state = state.to(dtype=torch.float16)
+                img = img.to(dtype=torch.bfloat16)
+                action = action.to(dtype=torch.bfloat16)
+                state = state.to(dtype=torch.bfloat16)
 
         if timesteps is None:
             timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
diff --git a/src/unifolm_wma/modules/attention.py b/src/unifolm_wma/modules/attention.py
index 27161ef..5b7e1b7 100644
--- a/src/unifolm_wma/modules/attention.py
+++ b/src/unifolm_wma/modules/attention.py
@@ -173,7 +173,8 @@ class CrossAttention(nn.Module):
             sim.masked_fill_(~(mask > 0.5), max_neg_value)
 
         # attention, what we cannot get enough of
-        sim = sim.softmax(dim=-1)
+        with torch.amp.autocast('cuda', enabled=False):
+            sim = sim.softmax(dim=-1)
 
         out = torch.einsum('b i j, b j d -> b i d', sim, v)
         if self.relative_position:
@@ -190,7 +191,8 @@ class CrossAttention(nn.Module):
             sim_ip = torch.einsum('b i d, b j d -> b i j', q,
                                   k_ip) * self.scale
             del k_ip
-            sim_ip = sim_ip.softmax(dim=-1)
+            with torch.amp.autocast('cuda', enabled=False):
+                sim_ip = sim_ip.softmax(dim=-1)
             out_ip = torch.einsum('b i j, b j d -> b i d', sim_ip, v_ip)
             out_ip = rearrange(out_ip, '(b h) n d -> b n (h d)', h=h)
 
@@ -201,7 +203,8 @@ class CrossAttention(nn.Module):
             sim_as = torch.einsum('b i d, b j d -> b i j', q,
                                   k_as) * self.scale
             del k_as
-            sim_as = sim_as.softmax(dim=-1)
+            with torch.amp.autocast('cuda', enabled=False):
+                sim_as = sim_as.softmax(dim=-1)
             out_as = torch.einsum('b i j, b j d -> b i d', sim_as, v_as)
             out_as = rearrange(out_as, '(b h) n d -> b n (h d)', h=h)
 
@@ -212,7 +215,8 @@ class CrossAttention(nn.Module):
             sim_aa = torch.einsum('b i d, b j d -> b i j', q,
                                   k_aa) * self.scale
             del k_aa
-            sim_aa = sim_aa.softmax(dim=-1)
+            with torch.amp.autocast('cuda', enabled=False):
+                sim_aa = sim_aa.softmax(dim=-1)
             out_aa = torch.einsum('b i j, b j d -> b i d', sim_aa, v_aa)
             out_aa = rearrange(out_aa, '(b h) n d -> b n (h d)', h=h)
 
diff --git a/unitree_z1_dual_arm_cleanup_pencils/case1/output.log b/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
index a8d5d16..e9417c8 100644
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
@@ -1,12 +1,12 @@
-2026-02-08 15:47:30.035545: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
-2026-02-08 15:47:30.038628: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
-2026-02-08 15:47:30.069635: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
-2026-02-08 15:47:30.069671: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
-2026-02-08 15:47:30.071534: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
-2026-02-08 15:47:30.080021: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
-2026-02-08 15:47:30.080300: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+2026-02-08 16:49:41.598605: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-08 16:49:41.601687: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
+2026-02-08 16:49:41.632954: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-08 16:49:41.632986: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-08 16:49:41.634849: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-08 16:49:41.643134: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
+2026-02-08 16:49:41.643414: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
-2026-02-08 15:47:30.746161: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+2026-02-08 16:49:42.320864: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 [rank: 0] Global seed set to 123
 /mnt/ASC1637/miniconda3/envs/unifolm-wma/lib/python3.10/site-packages/kornia/feature/lightglue.py:44: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
   @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
@@ -23,7 +23,7 @@ INFO:root:Loading pretrained ViT-H-14 weights (laion2b_s32b_b79k).
 INFO:root:Loaded ViT-H-14 model config.
 DEBUG:urllib3.connectionpool:https://hf-mirror.com:443 "HEAD /laion/CLIP-ViT-H-14-laion2B-s32B-b79K/resolve/main/open_clip_pytorch_model.bin HTTP/1.1" 302 0
 INFO:root:Loading pretrained ViT-H-14 weights (laion2b_s32b_b79k).
-/mnt/ASC1637/unifolm-world-model-action/scripts/evaluation/world_model_interaction.py:183: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+/mnt/ASC1637/unifolm-world-model-action/scripts/evaluation/world_model_interaction.py:198: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
   state_dict = torch.load(ckpt, map_location="cpu")
 >>> model checkpoint loaded.
 >>> Load pre-trained model ...
@@ -38,6 +38,7 @@ INFO:root:Loading pretrained ViT-H-14 weights (laion2b_s32b_b79k).
     ✓ VAE converted to bfloat16
     ⚠ Found 601 fp32 params, converting to bf16
     ✓ All parameters converted to bfloat16
+    ✓ torch.compile: 3 ResBlocks in output_blocks[5, 8, 9]
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
@@ -113,7 +114,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
- 12%|█▎        | 1/8 [01:19<09:17, 79.58s/it] 25%|██▌       | 2/8 [02:38<07:54, 79.06s/it] 38%|███▊      | 3/8 [03:56<06:34, 78.87s/it] 50%|█████     | 4/8 [05:15<05:15, 78.85s/it] 62%|██████▎   | 5/8 [06:34<03:56, 78.84s/it] 75%|███████▌  | 6/8 [07:53<02:37, 78.81s/it] 88%|████████▊ | 7/8 [09:11<01:18, 78.71s/it]100%|██████████| 8/8 [10:30<00:00, 78.66s/it]100%|██████████| 8/8 [10:30<00:00, 78.80s/it]
+ 12%|█▎        | 1/8 [01:15<08:45, 75.10s/it] 25%|██▌       | 2/8 [02:26<07:17, 72.96s/it] 38%|███▊      | 3/8 [03:38<06:01, 72.27s/it] 50%|█████     | 4/8 [04:49<04:48, 72.00s/it] 62%|██████▎   | 5/8 [06:01<03:35, 71.97s/it] 75%|███████▌  | 6/8 [07:12<02:23, 71.77s/it] 88%|████████▊ | 7/8 [08:24<01:11, 71.56s/it]100%|██████████| 8/8 [09:35<00:00, 71.59s/it]100%|██████████| 8/8 [09:35<00:00, 71.96s/it]
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
@@ -137,6 +138,6 @@ DEBUG:PIL.Image:Importing XVThumbImagePlugin
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 
-real	11m29.763s
-user	12m56.891s
-sys	0m55.414s
+real	10m35.511s
+user	12m11.689s
+sys	0m40.191s
diff --git a/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json b/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json
index 73fc660..e9d3dba 100644
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json
@@ -1,5 +1,5 @@
 {
     "gt_video": "/mnt/ASC1637/unifolm-world-model-action/unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/unitree_z1_dual_arm_cleanup_pencils_case1_amd.mp4",
     "pred_video": "/mnt/ASC1637/unifolm-world-model-action/unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/0_full_fs4.mp4",
-    "psnr": 30.24435361473318
+    "psnr": 30.058508734449845
 }
\ No newline at end of file