Revert "tensorRT engines尝试精度没过，暂时先提交代码，后续再继续调试"

This reverts commit e1f8a83648.
tensorRT engines尝试精度没过，暂时先提交代码，后续再继续调试
2026-02-19 20:22:19 +08:00 · 2026-02-18 18:22:12 +08:00 · 2026-02-18 14:11:55 +08:00 · 2026-02-11 19:21:06 +08:00 · 2026-02-11 17:07:33 +08:00 · 2026-02-11 16:38:21 +08:00
79 changed files with 8088 additions and 329 deletions
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -0,0 +1,21 @@
 {
  "permissions": {
    "allow": [
      "Bash(conda env list:*)",
      "Bash(mamba env:*)",
      "Bash(micromamba env list:*)",
      "Bash(echo:*)",
      "Bash(git show:*)",
      "Bash(nvidia-smi:*)",
      "Bash(conda activate unifolm-wma)",
      "Bash(conda info:*)",
      "Bash(direnv allow:*)",
      "Bash(ls:*)",
      "Bash(for scenario in unitree_g1_pack_camera unitree_z1_dual_arm_cleanup_pencils unitree_z1_dual_arm_stackbox unitree_z1_dual_arm_stackbox_v2 unitree_z1_stackbox)",
      "Bash(do for case in case1 case2 case3 case4)",
      "Bash(done)",
      "Bash(chmod:*)",
      "Bash(ln:*)"
    ]
  }
 }
--- a/.envrc
+++ b/.envrc
@@ -0,0 +1,2 @@
 # Evaluate the shell code printed by `conda shell.bash hook` so that the
 # `conda activate` command below is available; the hook's stderr is discarded.
 eval "$(conda shell.bash hook 2>/dev/null)"
 # Activate the `unifolm-wma` conda environment.
 conda activate unifolm-wma
--- a/.gitignore
+++ b/.gitignore
@@ -55,7 +55,6 @@ coverage.xml
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
@@ -121,7 +120,7 @@ localTest/
 fig/
 figure/
 *.mp4
-*.json
+
 Data/ControlVAE.yml
 Data/Misc
 Data/Pretrained
@@ -130,3 +129,6 @@ Experiment/checkpoint
 Experiment/log
 *.ckpt
 *.0
 ckpts/unifolm_wma_dual.ckpt.prepared.pt
--- a/configs/inference/world_model_interaction.yaml
+++ b/configs/inference/world_model_interaction.yaml
@@ -222,7 +222,7 @@ data:
    test:
      target: unifolm_wma.data.wma_data.WMAData
      params:
-        data_dir: '/mnt/ASC1637/unifolm-world-model-action/examples/world_model_interaction_prompts'
+        data_dir: '/home/qhy/unifolm-world-model-action/examples/world_model_interaction_prompts'
        video_length: ${model.params.wma_config.params.temporal_length}
        frame_stride: 2
        load_raw_resolution: True
--- a/run_all_case.sh
+++ b/run_all_case.sh
@@ -0,0 +1,114 @@
 #!/bin/bash
 # Run every case of every scenario, one after another.
 # 5 scenarios x 4 cases each = 20 runs in total.
 #
 # Each run executes <scenario>/case<N>/run_world_model_interaction.sh (resolved
 # against the current working directory); its stdout and stderr are appended
 # to a single timestamped log file. A per-run status line and a final summary
 # are printed to the terminal.

 # Environment variables (offline mode).
 export HF_HUB_OFFLINE=1
 export TRANSFORMERS_OFFLINE=1

 # Colour escape sequences (interpreted by `echo -e`).
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 BLUE='\033[0;34m'
 NC='\033[0m' # No Color

 # All scenarios to run.
 SCENARIOS=(
    "unitree_g1_pack_camera"
    "unitree_z1_dual_arm_cleanup_pencils"
    "unitree_z1_dual_arm_stackbox"
    "unitree_z1_dual_arm_stackbox_v2"
    "unitree_z1_stackbox"
 )

 # Case numbers to run inside each scenario.
 CASES=(1 2 3 4)

 #######################################
 # Format a duration as hours / minutes / seconds.
 # Uses local variables only: assigning to Bash's special SECONDS variable
 # would reset the shell's elapsed-time counter and make later reads drift.
 # Arguments: $1 - duration in whole seconds
 # Outputs:   "<H>小时 <M>分钟 <S>秒" on stdout
 #######################################
 format_duration() {
    local total_secs=$1
    printf '%d小时 %d分钟 %d秒\n' \
        "$((total_secs / 3600))" \
        "$(((total_secs % 3600) / 60))" \
        "$((total_secs % 60))"
 }

 # Record the start time and pick the log file name.
 START_TIME=$(date +%s)
 LOG_FILE="run_all_cases_$(date +%Y%m%d_%H%M%S).log"

 # Counters.
 TOTAL_CASES=$((${#SCENARIOS[@]} * ${#CASES[@]}))
 CURRENT_CASE=0
 SUCCESS_COUNT=0
 FAIL_COUNT=0

 # Cases that failed (or whose script was missing).
 declare -a FAILED_CASES=()

 echo -e "${BLUE}========================================${NC}"
 echo -e "${BLUE}开始执行所有场景的case${NC}"
 echo -e "${BLUE}总共: ${#SCENARIOS[@]} 个场景 x ${#CASES[@]} 个case = ${TOTAL_CASES} 个任务${NC}"
 echo -e "${BLUE}日志文件: ${LOG_FILE}${NC}"
 echo -e "${BLUE}========================================${NC}"
 echo ""

 # Iterate over every scenario.
 for scenario in "${SCENARIOS[@]}"; do
    echo -e "${YELLOW}>>> 场景: ${scenario}${NC}"

    # Iterate over every case of the scenario.
    for case_num in "${CASES[@]}"; do
        CURRENT_CASE=$((CURRENT_CASE + 1))
        case_dir="${scenario}/case${case_num}"
        script_path="${case_dir}/run_world_model_interaction.sh"
        echo -e "${BLUE}[${CURRENT_CASE}/${TOTAL_CASES}] 执行: ${case_dir}${NC}"

        # A missing script counts as a failure; move on to the next case.
        if [ ! -f "${script_path}" ]; then
            echo -e "${RED}错误: 脚本不存在 ${script_path}${NC}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            FAILED_CASES+=("${case_dir} (脚本不存在)")
            continue
        fi

        # Run the case script; all of its output goes to the shared log file.
        echo "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"
        if bash "${script_path}" >> "${LOG_FILE}" 2>&1; then
            echo -e "${GREEN}✓ 成功: ${case_dir}${NC}"
            SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
        else
            echo -e "${RED}✗ 失败: ${case_dir}${NC}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            FAILED_CASES+=("${case_dir}")
        fi
        echo "结束时间: $(date '+%Y-%m-%d %H:%M:%S')"
        echo ""
    done
    echo ""
 done

 # Total elapsed wall-clock time.
 END_TIME=$(date +%s)
 DURATION=$((END_TIME - START_TIME))

 # Summary.
 echo -e "${BLUE}========================================${NC}"
 echo -e "${BLUE}执行完成！${NC}"
 echo -e "${BLUE}========================================${NC}"
 echo -e "总任务数: ${TOTAL_CASES}"
 echo -e "${GREEN}成功: ${SUCCESS_COUNT}${NC}"
 echo -e "${RED}失败: ${FAIL_COUNT}${NC}"
 echo -e "总耗时: $(format_duration "${DURATION}")"
 echo -e "详细日志: ${LOG_FILE}"
 echo ""

 # List the failed cases, if any.
 if ((FAIL_COUNT > 0)); then
    echo -e "${RED}失败的case列表:${NC}"
    for failed_case in "${FAILED_CASES[@]}"; do
        echo -e "${RED}  - ${failed_case}${NC}"
    done
    echo ""
 fi
 echo -e "${BLUE}========================================${NC}"
--- a/run_all_cases_20260211_135725.log
+++ b/run_all_cases_20260211_135725.log
--- a/run_all_cases_20260211_173422.log
+++ b/run_all_cases_20260211_173422.log
@@ -0,0 +1,37 @@
 2026-02-11 17:34:29.188470: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 17:34:29.238296: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 17:34:29.238342: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 17:34:29.239649: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 17:34:29.247152: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 17:34:30.172640: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
--- a/run_all_cases_20260211_173635.log
+++ b/run_all_cases_20260211_173635.log
--- a/run_all_cases_20260211_181733.log
+++ b/run_all_cases_20260211_181733.log
--- a/run_all_psnr.sh
+++ b/run_all_psnr.sh
@@ -0,0 +1,61 @@
 #!/bin/bash
 # Score every scenario/case with psnr_score_for_challenge.py.
 #
 # For each <scenario>/<case> directory (relative to this script's directory)
 # the ground-truth video <scenario>_<case>.mp4 is compared against the first
 # "*_full_fs*.mp4" found under <case>/output/inference/, and
 # <case>/psnr_result.json is passed to the scorer as --output_file.
 # Cases with a missing ground-truth or predicted video are counted as failures.
 set -e

 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 cd "$SCRIPT_DIR"

 SCENARIOS=(
    unitree_g1_pack_camera
    unitree_z1_dual_arm_cleanup_pencils
    unitree_z1_dual_arm_stackbox
    unitree_z1_dual_arm_stackbox_v2
    unitree_z1_stackbox
 )
 CASES=(case1 case2 case3 case4)

 # Progress denominator, derived from the arrays so it cannot go stale.
 total_cases=$((${#SCENARIOS[@]} * ${#CASES[@]}))
 total=0
 success=0
 fail=0

 #######################################
 # Print the first predicted video of a case directory.
 # Relies on the shell's sorted glob expansion instead of parsing `ls`.
 # Arguments: $1 - case directory
 # Outputs:   path of the first matching file on stdout (nothing if none)
 # Returns:   0 if a file was found, 1 otherwise
 #######################################
 find_pred_video() {
    local candidate
    for candidate in "$1"/output/inference/*_full_fs*.mp4; do
        # An unmatched glob stays literal, so confirm the file really exists.
        if [ -f "${candidate}" ]; then
            printf '%s\n' "${candidate}"
            return 0
        fi
    done
    return 1
 }

 for scenario in "${SCENARIOS[@]}"; do
    for case in "${CASES[@]}"; do
        case_dir="${scenario}/${case}"
        gt_video="${case_dir}/${scenario}_${case}.mp4"
        # `|| ...` keeps `set -e` from aborting when no prediction exists.
        pred_video=$(find_pred_video "${case_dir}") || pred_video=""
        output_file="${case_dir}/psnr_result.json"
        total=$((total + 1))
        echo "=========================================="
        echo "[${total}/${total_cases}] ${case_dir}"
        if [ ! -f "$gt_video" ]; then
            echo "  SKIP: GT video not found: $gt_video"
            fail=$((fail + 1))
            continue
        fi
        if [ -z "$pred_video" ]; then
            echo "  SKIP: pred video not found in ${case_dir}/output/inference/"
            fail=$((fail + 1))
            continue
        fi
        echo "  GT:   $gt_video"
        echo "  Pred: $pred_video"
        echo "  Out:  $output_file"
        # Running the scorer inside `if` keeps a failure from tripping `set -e`.
        if python3 psnr_score_for_challenge.py \
            --gt_video "$gt_video" \
            --pred_video "$pred_video" \
            --output_file "$output_file"; then
            success=$((success + 1))
            echo "  DONE"
        else
            fail=$((fail + 1))
            echo "  FAILED"
        fi
    done
 done
 echo "=========================================="
 echo "Finished: ${success} success, ${fail} fail, ${total} total"
--- a/scripts/evaluation/base_model_inference.py
+++ b/scripts/evaluation/base_model_inference.py
@@ -16,6 +16,9 @@ from collections import OrderedDict
 from unifolm_wma.models.samplers.ddim import DDIMSampler
 from unifolm_wma.utils.utils import instantiate_from_config
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 def get_filelist(data_dir: str, postfixes: list[str]) -> list[str]:
    """
--- a/scripts/evaluation/real_eval_server.py
+++ b/scripts/evaluation/real_eval_server.py
@@ -19,6 +19,9 @@ from fastapi.responses import JSONResponse
 from typing import Any, Dict, Optional, Tuple, List
 from datetime import datetime
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 from unifolm_wma.utils.utils import instantiate_from_config
 from unifolm_wma.models.samplers.ddim import DDIMSampler
--- a/scripts/evaluation/world_model_interaction.py
+++ b/scripts/evaluation/world_model_interaction.py
@@ -9,6 +9,8 @@ import logging
 import einops
 import warnings
 import imageio
 import atexit
 from concurrent.futures import ThreadPoolExecutor
 from pytorch_lightning import seed_everything
 from omegaconf import OmegaConf
@@ -16,8 +18,12 @@ from tqdm import tqdm
 from einops import rearrange, repeat
 from collections import OrderedDict
 from torch import nn
-from eval_utils import populate_queues, log_to_tensorboard
+from eval_utils import populate_queues
 from collections import deque
 from typing import Optional, List, Any
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 from torch import Tensor
 from torch.utils.tensorboard import SummaryWriter
 from PIL import Image
@@ -150,6 +156,81 @@ def save_results(video: Tensor, filename: str, fps: int = 8) -> None:
                               options={'crf': '10'})
 # ========== Async I/O ==========
 _io_executor: Optional[ThreadPoolExecutor] = None
 _io_futures: List[Any] = []
 def _get_io_executor() -> ThreadPoolExecutor:
    """Return the shared background I/O executor, creating it on first use.

    The executor is cached in the module-level ``_io_executor`` and is built
    with two worker threads.
    """
    global _io_executor
    if _io_executor is not None:
        return _io_executor
    _io_executor = ThreadPoolExecutor(max_workers=2)
    return _io_executor
 def _flush_io():
    """Block until every queued background I/O task has completed.

    Each future in the module-level ``_io_futures`` list is awaited in order.
    An ``Exception`` raised by a task is printed rather than propagated, and
    the list is emptied once all futures have been visited.
    """
    for pending in _io_futures:
        try:
            pending.result()
        except Exception as err:
            print(f">>> [async I/O] error: {err}")
    _io_futures.clear()
 atexit.register(_flush_io)
 def _save_results_sync(video_cpu: Tensor, filename: str, fps: int) -> None:
    """Encode a CPU video tensor to an H.264 file (meant for a worker thread).

    Args:
        video_cpu: 5-D tensor; presumably [B, C, T, H, W] with values around
            [-1, 1] (TODO confirm against callers). Values are clamped to
            [-1, 1] before being rescaled to uint8.
        filename: Destination path handed to ``torchvision.io.write_video``.
        fps: Frame rate of the written video.
    """
    clipped = torch.clamp(video_cpu.float(), -1., 1.)
    batch = clipped.shape[0]
    # Bring axis 2 to the front so that iterating yields one sheet per index.
    per_step = clipped.permute(2, 0, 1, 3, 4)
    sheets = []
    for framesheet in per_step:
        sheets.append(
            torchvision.utils.make_grid(framesheet, nrow=int(batch), padding=0))
    stacked = torch.stack(sheets, dim=0)
    # Rescale [-1, 1] -> [0, 1] -> [0, 255] and move channels last.
    stacked = (stacked + 1.0) / 2.0
    frames_uint8 = (stacked * 255).to(torch.uint8).permute(0, 2, 3, 1)
    torchvision.io.write_video(filename,
                               frames_uint8,
                               fps=fps,
                               video_codec='h264',
                               options={'crf': '10'})
 def save_results_async(video: Tensor, filename: str, fps: int = 8) -> None:
    """Queue a video save on the background I/O executor.

    The tensor is detached and copied to the CPU in the calling thread; the
    encoding itself is done by ``_save_results_sync`` on a worker. The
    resulting future is recorded in ``_io_futures`` so ``_flush_io`` can wait
    for it.

    Args:
        video: Video tensor to save.
        filename: Output path.
        fps: Frame rate of the written video (default 8).
    """
    cpu_copy = video.detach().cpu()
    executor = _get_io_executor()
    _io_futures.append(
        executor.submit(_save_results_sync, cpu_copy, filename, fps))
 def _log_to_tb_sync(writer, video_cpu: Tensor, tag: str, fps: int) -> None:
    """Write a CPU video tensor to TensorBoard (meant for a worker thread).

    Tensors that are not 5-D are ignored. For 5-D input, one grid is built
    per index of axis 2, values are shifted from [-1, 1] to [0, 1], and the
    result is passed to ``writer.add_video`` with a leading batch axis.

    Args:
        writer: Object exposing ``add_video(tag, tensor, fps=...)``.
        video_cpu: Video tensor; presumably [B, C, T, H, W] (TODO confirm).
        tag: TensorBoard tag.
        fps: Frame rate passed to ``add_video``.
    """
    if video_cpu.dim() != 5:
        return
    batch = video_cpu.shape[0]
    per_step = video_cpu.permute(2, 0, 1, 3, 4)
    sheets = []
    for framesheet in per_step:
        sheets.append(
            torchvision.utils.make_grid(framesheet, nrow=int(batch), padding=0))
    stacked = torch.stack(sheets, dim=0)
    stacked = (stacked + 1.0) / 2.0
    writer.add_video(tag, stacked.unsqueeze(dim=0), fps=fps)
 def log_to_tensorboard_async(writer, data: Tensor, tag: str, fps: int = 10) -> None:
    """Queue a TensorBoard video log on the background I/O executor.

    Only 5-D ``torch.Tensor`` inputs are logged; anything else is silently
    ignored. The tensor is detached and copied to the CPU in the calling
    thread, and the resulting future is recorded in ``_io_futures``.

    Args:
        writer: TensorBoard writer forwarded to ``_log_to_tb_sync``.
        data: Candidate video tensor.
        tag: TensorBoard tag.
        fps: Frame rate for the logged video (default 10).
    """
    if not isinstance(data, torch.Tensor):
        return
    if data.dim() != 5:
        return
    cpu_copy = data.detach().cpu()
    executor = _get_io_executor()
    _io_futures.append(
        executor.submit(_log_to_tb_sync, writer, cpu_copy, tag, fps))
 def get_init_frame_path(data_dir: str, sample: dict) -> str:
    """Construct the init_frame path from directory and sample metadata.
@@ -327,7 +408,8 @@ def image_guided_synthesis_sim_mode(
        timestep_spacing: str = 'uniform',
        guidance_rescale: float = 0.0,
        sim_mode: bool = True,
-        **kwargs) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        decode_video: bool = True,
        **kwargs) -> tuple[torch.Tensor | None, torch.Tensor, torch.Tensor]:
    """
    Performs image-guided video generation in a simulation-style mode with optional multimodal guidance (image, state, action, text).
@@ -350,10 +432,13 @@ def image_guided_synthesis_sim_mode(
        timestep_spacing (str): Timestep sampling method in DDIM sampler. Typically "uniform" or "linspace".
        guidance_rescale (float): Guidance rescaling factor to mitigate overexposure from classifier-free guidance.
        sim_mode (bool): Whether to perform world-model interaction or decision-making using the world-model.
        decode_video (bool): Whether to decode latent samples to pixel-space video.
            Set to False to skip VAE decode for speed when only actions/states are needed.
        **kwargs: Additional arguments passed to the DDIM sampler.
    Returns:
-        batch_variants (torch.Tensor): Predicted pixel-space video frames [B, C, T, H, W].
+        batch_variants (torch.Tensor | None): Predicted pixel-space video frames [B, C, T, H, W],
            or None when decode_video=False.
        actions (torch.Tensor): Predicted action sequences [B, T, D] from diffusion decoding.
        states (torch.Tensor): Predicted state sequences [B, T, D] from diffusion decoding.
    """
@@ -365,8 +450,9 @@ def image_guided_synthesis_sim_mode(
    img = observation['observation.images.top'].permute(0, 2, 1, 3, 4)
    cond_img = rearrange(img, 'b o c h w -> (b o) c h w')[-1:]
-    cond_img_emb = model.embedder(cond_img)
+    with torch.cuda.amp.autocast(dtype=torch.float16):
-    cond_img_emb = model.image_proj_model(cond_img_emb)
+        cond_img_emb = model.embedder(cond_img)
        cond_img_emb = model.image_proj_model(cond_img_emb)
    if model.model.conditioning_key == 'hybrid':
        z = get_latent_z(model, img.permute(0, 2, 1, 3, 4))
@@ -380,11 +466,12 @@ def image_guided_synthesis_sim_mode(
        prompts = [""] * batch_size
    cond_ins_emb = model.get_learned_conditioning(prompts)
-    cond_state_emb = model.state_projector(observation['observation.state'])
+    with torch.cuda.amp.autocast(dtype=torch.float16):
-    cond_state_emb = cond_state_emb + model.agent_state_pos_emb
+        cond_state_emb = model.state_projector(observation['observation.state'])
        cond_state_emb = cond_state_emb + model.agent_state_pos_emb
-    cond_action_emb = model.action_projector(observation['action'])
+        cond_action_emb = model.action_projector(observation['action'])
-    cond_action_emb = cond_action_emb + model.agent_action_pos_emb
+        cond_action_emb = cond_action_emb + model.agent_action_pos_emb
    if not sim_mode:
        cond_action_emb = torch.zeros_like(cond_action_emb)
@@ -406,6 +493,8 @@ def image_guided_synthesis_sim_mode(
    kwargs.update({"unconditional_conditioning_img_nonetext": None})
    cond_mask = None
    cond_z0 = None
    batch_variants = None
    samples = None
    if ddim_sampler is not None:
        samples, actions, states, intermedia = ddim_sampler.sample(
            S=ddim_steps,
@@ -424,11 +513,12 @@ def image_guided_synthesis_sim_mode(
            guidance_rescale=guidance_rescale,
            **kwargs)
-        # Reconstruct from latent to pixel space
+        if decode_video:
-        batch_images = model.decode_first_stage(samples)
+            # Reconstruct from latent to pixel space
-        batch_variants = batch_images
+            batch_images = model.decode_first_stage(samples)
            batch_variants = batch_images
-    return batch_variants, actions, states
+    return batch_variants, actions, states, samples
 def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
@@ -453,26 +543,67 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
    csv_path = os.path.join(args.prompt_dir, f"{args.dataset}.csv")
    df = pd.read_csv(csv_path)
-    # Load config
+    # Load config (always needed for data setup)
    config = OmegaConf.load(args.config)
    config['model']['params']['wma_config']['params'][
        'use_checkpoint'] = False
    model = instantiate_from_config(config.model)
    model.perframe_ae = args.perframe_ae
    assert os.path.exists(args.ckpt_path), "Error: checkpoint Not Found!"
    model = load_model_checkpoint(model, args.ckpt_path)
    model.eval()
    print(f'>>> Load pre-trained model ...')
-    # Build unnomalizer
+    prepared_path = args.ckpt_path + ".prepared.pt"
    if os.path.exists(prepared_path):
        # ---- Fast path: load the fully-prepared model ----
        print(f">>> Loading prepared model from {prepared_path} ...")
        model = torch.load(prepared_path,
                           map_location=f"cuda:{gpu_no}",
                           weights_only=False,
                           mmap=True)
        model.eval()
        print(f">>> Prepared model loaded.")
    else:
        # ---- Normal path: construct + load checkpoint ----
        config['model']['params']['wma_config']['params'][
            'use_checkpoint'] = False
        model = instantiate_from_config(config.model)
        model.perframe_ae = args.perframe_ae
        assert os.path.exists(args.ckpt_path), "Error: checkpoint Not Found!"
        model = load_model_checkpoint(model, args.ckpt_path)
        model.eval()
        model = model.cuda(gpu_no)
        print(f'>>> Load pre-trained model ...')
        # Save prepared model for fast loading next time
        print(f">>> Saving prepared model to {prepared_path} ...")
        torch.save(model, prepared_path)
        print(f">>> Prepared model saved ({os.path.getsize(prepared_path) / 1024**3:.1f} GB).")
    # ---- FP16: convert diffusion backbone + conditioning modules ----
    model.model.to(torch.float16)
    model.model.diffusion_model.dtype = torch.float16
    print(">>> Diffusion backbone (model.model) converted to FP16.")
    # Projectors / MLP → FP16
    model.image_proj_model.half()
    model.state_projector.half()
    model.action_projector.half()
    print(">>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.")
    # Text/image encoders → FP16
    model.cond_stage_model.half()
    model.embedder.half()
    print(">>> Encoders (cond_stage_model, embedder) converted to FP16.")
    # Build normalizer (always needed, independent of model loading path)
    logging.info("***** Configing Data *****")
    data = instantiate_from_config(config.data)
    data.setup()
    print(">>> Dataset is successfully loaded ...")
    model = model.cuda(gpu_no)
    device = get_device_from_parameters(model)
    # Fuse KV projections in attention layers (to_k + to_v → to_kv)
    from unifolm_wma.modules.attention import CrossAttention
    kv_count = sum(1 for m in model.modules()
                   if isinstance(m, CrossAttention) and m.fuse_kv())
    print(f"    ✓ KV fused: {kv_count} attention layers")
    # Run over data
    assert (args.height % 16 == 0) and (
        args.width % 16
@@ -518,7 +649,7 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
            sample_save_dir = f'{video_save_dir}/wm/{fs}'
            os.makedirs(sample_save_dir, exist_ok=True)
            # For collecting interaction videos
-            wm_video = []
+            wm_latent = []
            # Initialize observation queues
            cond_obs_queues = {
                "observation.images.top":
@@ -574,7 +705,7 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                # Use world-model in policy to generate action
                print(f'>>> Step {itr}: generating actions ...')
-                pred_videos_0, pred_actions, _ = image_guided_synthesis_sim_mode(
+                pred_videos_0, pred_actions, _, _ = image_guided_synthesis_sim_mode(
                    model,
                    sample['instruction'],
                    observation,
@@ -587,7 +718,8 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                    fs=model_input_fs,
                    timestep_spacing=args.timestep_spacing,
                    guidance_rescale=args.guidance_rescale,
-                    sim_mode=False)
+                    sim_mode=False,
                    decode_video=not args.fast_policy_no_decode)
                # Update future actions in the observation queues
                for idx in range(len(pred_actions[0])):
@@ -615,7 +747,7 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                # Interaction with the world-model
                print(f'>>> Step {itr}: interacting with world model ...')
-                pred_videos_1, _, pred_states = image_guided_synthesis_sim_mode(
+                pred_videos_1, _, pred_states, wm_samples = image_guided_synthesis_sim_mode(
                    model,
                    "",
                    observation,
@@ -628,12 +760,16 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                    fs=model_input_fs,
                    text_input=False,
                    timestep_spacing=args.timestep_spacing,
-                    guidance_rescale=args.guidance_rescale)
+                    guidance_rescale=args.guidance_rescale,
                    decode_video=False)
                # Decode only the last frame for CLIP embedding in next iteration
                last_frame_pixel = model.decode_first_stage(wm_samples[:, :, -1:, :, :])
                for idx in range(args.exe_steps):
                    observation = {
                        'observation.images.top':
-                        pred_videos_1[0][:, idx:idx + 1].permute(1, 0, 2, 3),
+                        last_frame_pixel[0, :, 0:1].permute(1, 0, 2, 3),
                        'observation.state':
                        torch.zeros_like(pred_states[0][idx:idx + 1]) if
                        args.zero_pred_state else pred_states[0][idx:idx + 1],
@@ -644,42 +780,31 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                    cond_obs_queues = populate_queues(cond_obs_queues,
                                                      observation)
-                # Save the imagen videos for decision-making
+                # Save the imagen videos for decision-making (async)
-                sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}"
+                if pred_videos_0 is not None:
-                log_to_tensorboard(writer,
+                    sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}"
-                                   pred_videos_0,
+                    log_to_tensorboard_async(writer,
-                                   sample_tag,
+                                             pred_videos_0,
-                                   fps=args.save_fps)
+                                             sample_tag,
-                # Save videos environment changes via world-model interaction
+                                             fps=args.save_fps)
                sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/itr-{itr}"
                log_to_tensorboard(writer,
                                   pred_videos_1,
                                   sample_tag,
                                   fps=args.save_fps)
                # Save the imagen videos for decision-making
                sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4'
                save_results(pred_videos_0.cpu(),
                             sample_video_file,
                             fps=args.save_fps)
                # Save videos environment changes via world-model interaction
                sample_video_file = f'{video_save_dir}/wm/{fs}/itr-{itr}.mp4'
                save_results(pred_videos_1.cpu(),
                             sample_video_file,
                             fps=args.save_fps)
                print('>' * 24)
-                # Collect the result of world-model interactions
+                # Store raw latent for deferred decode
-                wm_video.append(pred_videos_1[:, :, :args.exe_steps].cpu())
+                wm_latent.append(wm_samples[:, :, :args.exe_steps].cpu())
-            full_video = torch.cat(wm_video, dim=2)
+            # Deferred decode: batch decode all stored latents
            full_latent = torch.cat(wm_latent, dim=2).to(device)
            full_video = model.decode_first_stage(full_latent).cpu()
            sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/full"
-            log_to_tensorboard(writer,
+            log_to_tensorboard_async(writer,
-                               full_video,
+                                     full_video,
-                               sample_tag,
+                                     sample_tag,
-                               fps=args.save_fps)
+                                     fps=args.save_fps)
            sample_full_video_file = f"{video_save_dir}/../{sample['videoid']}_full_fs{fs}.mp4"
-            save_results(full_video, sample_full_video_file, fps=args.save_fps)
+            save_results_async(full_video, sample_full_video_file, fps=args.save_fps)
    # Wait for all async I/O to complete
    _flush_io()
 def get_parser():
@@ -794,6 +919,11 @@ def get_parser():
                        action='store_true',
                        default=False,
                        help="not using the predicted states as comparison")
    parser.add_argument(
        "--fast_policy_no_decode",
        action='store_true',
        default=True,
        help="Speed mode: policy pass only predicts actions, skip policy video decode/log/save.")
    parser.add_argument("--save_fps",
                        type=int,
                        default=8,
--- a/scripts/trainer.py
+++ b/scripts/trainer.py
@@ -11,6 +11,9 @@ from unifolm_wma.utils.utils import instantiate_from_config
 from unifolm_wma.utils.train import get_trainer_callbacks, get_trainer_logger, get_trainer_strategy
 from unifolm_wma.utils.train import set_logger, init_workspace, load_checkpoints, get_num_parameters
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
 def get_parser(**parser_kwargs):
    parser = argparse.ArgumentParser(**parser_kwargs)
--- a/src/unifolm_wma/models/ddpms.py
+++ b/src/unifolm_wma/models/ddpms.py
@@ -988,7 +988,7 @@ class LatentDiffusion(DDPM):
    def instantiate_cond_stage(self, config: OmegaConf) -> None:
        """
-        Build the conditioning stage model.
+        Build the conditioning stage model. Frozen models are converted to FP16.
        Args:
            config: OmegaConf config describing the conditioning model to instantiate.
@@ -1000,6 +1000,7 @@ class LatentDiffusion(DDPM):
            self.cond_stage_model.train = disabled_train
            for param in self.cond_stage_model.parameters():
                param.requires_grad = False
            self.cond_stage_model.half()
        else:
            model = instantiate_from_config(config)
            self.cond_stage_model = model
@@ -1014,17 +1015,18 @@ class LatentDiffusion(DDPM):
        Returns:
            Conditioning embedding as a tensor (shape depends on cond model).
        """
-        if self.cond_stage_forward is None:
+        with torch.cuda.amp.autocast(dtype=torch.float16):
-            if hasattr(self.cond_stage_model, 'encode') and callable(
+            if self.cond_stage_forward is None:
-                    self.cond_stage_model.encode):
+                if hasattr(self.cond_stage_model, 'encode') and callable(
-                c = self.cond_stage_model.encode(c)
+                        self.cond_stage_model.encode):
-                if isinstance(c, DiagonalGaussianDistribution):
+                    c = self.cond_stage_model.encode(c)
-                    c = c.mode()
+                    if isinstance(c, DiagonalGaussianDistribution):
                        c = c.mode()
                else:
                    c = self.cond_stage_model(c)
            else:
-                c = self.cond_stage_model(c)
+                assert hasattr(self.cond_stage_model, self.cond_stage_forward)
-        else:
+                c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
            assert hasattr(self.cond_stage_model, self.cond_stage_forward)
            c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
        return c
    def get_first_stage_encoding(
@@ -1957,6 +1959,7 @@ class LatentVisualDiffusion(LatentDiffusion):
            self.image_proj_model.train = disabled_train
            for param in self.image_proj_model.parameters():
                param.requires_grad = False
            self.image_proj_model.half()
    def _init_embedder(self, config: OmegaConf, freeze: bool = True) -> None:
        """
@@ -1972,6 +1975,7 @@ class LatentVisualDiffusion(LatentDiffusion):
            self.embedder.train = disabled_train
            for param in self.embedder.parameters():
                param.requires_grad = False
            self.embedder.half()
    def init_normalizers(self, normalize_config: OmegaConf,
                         dataset_stats: Mapping[str, Any]) -> None:
@@ -2175,8 +2179,9 @@ class LatentVisualDiffusion(LatentDiffusion):
            (random_num < 3 * self.uncond_prob).float(), "n -> n 1 1 1")
        cond_img = input_mask * img
-        cond_img_emb = self.embedder(cond_img)
+        with torch.cuda.amp.autocast(dtype=torch.float16):
-        cond_img_emb = self.image_proj_model(cond_img_emb)
+            cond_img_emb = self.embedder(cond_img)
            cond_img_emb = self.image_proj_model(cond_img_emb)
        if self.model.conditioning_key == 'hybrid':
            if self.interp_mode:
@@ -2191,11 +2196,12 @@ class LatentVisualDiffusion(LatentDiffusion):
                                      repeat=z.shape[2])
            cond["c_concat"] = [img_cat_cond]
-        cond_action = self.action_projector(action)
+        with torch.cuda.amp.autocast(dtype=torch.float16):
-        cond_action_emb = self.agent_action_pos_emb + cond_action
+            cond_action = self.action_projector(action)
-        # Get conditioning states
+            cond_action_emb = self.agent_action_pos_emb + cond_action
-        cond_state = self.state_projector(obs_state)
+            # Get conditioning states
-        cond_state_emb = self.agent_state_pos_emb + cond_state
+            cond_state = self.state_projector(obs_state)
            cond_state_emb = self.agent_state_pos_emb + cond_state
        if self.decision_making_only:
            is_sim_mode = False
@@ -2457,7 +2463,17 @@ class DiffusionWrapper(pl.LightningModule):
        Returns:
            Output from the inner diffusion model (tensor or tuple, depending on the model).
        """
        with torch.cuda.amp.autocast(dtype=torch.float16):
            return self._forward_impl(x, x_action, x_state, t,
                                      c_concat, c_crossattn, c_crossattn_action,
                                      c_adm, s, mask, **kwargs)
    def _forward_impl(
        self,
        x, x_action, x_state, t,
        c_concat=None, c_crossattn=None, c_crossattn_action=None,
        c_adm=None, s=None, mask=None, **kwargs,
    ):
        if self.conditioning_key is None:
            out = self.diffusion_model(x, t)
        elif self.conditioning_key == 'concat':
--- a/src/unifolm_wma/models/diffusion_head/conditional_unet1d.py
+++ b/src/unifolm_wma/models/diffusion_head/conditional_unet1d.py
@@ -501,6 +501,10 @@ class ConditionalUnet1D(nn.Module):
        self.last_frame_only = last_frame_only
        self.horizon = horizon
        # Context precomputation cache
        self._global_cond_cache_enabled = False
        self._global_cond_cache = {}
    def forward(self,
                sample: torch.Tensor,
                timestep: Union[torch.Tensor, float, int],
@@ -530,14 +534,20 @@ class ConditionalUnet1D(nn.Module):
        B, T, D = sample.shape
        if self.use_linear_act_proj:
            sample = self.proj_in_action(sample.unsqueeze(-1))
-            global_cond = self.obs_encoder(cond)
+            _gc_key = (cond['image'].data_ptr(), cond['agent_pos'].data_ptr())
-            global_cond = rearrange(global_cond,
+            if self._global_cond_cache_enabled and _gc_key in self._global_cond_cache:
-                                    '(b t) d -> b 1 (t d)',
+                global_cond = self._global_cond_cache[_gc_key]
-                                    b=B,
+            else:
-                                    t=self.n_obs_steps)
+                global_cond = self.obs_encoder(cond)
-            global_cond = repeat(global_cond,
+                global_cond = rearrange(global_cond,
-                                 'b c d -> b (repeat c) d',
+                                        '(b t) d -> b 1 (t d)',
-                                 repeat=T)
+                                        b=B,
                                        t=self.n_obs_steps)
                global_cond = repeat(global_cond,
                                     'b c d -> b (repeat c) d',
                                     repeat=T)
                if self._global_cond_cache_enabled:
                    self._global_cond_cache[_gc_key] = global_cond
        else:
            sample = einops.rearrange(sample, 'b h t -> b t h')
            sample = self.proj_in_horizon(sample)
--- a/src/unifolm_wma/models/samplers/ddim.py
+++ b/src/unifolm_wma/models/samplers/ddim.py
@@ -6,6 +6,8 @@ from unifolm_wma.utils.diffusion import make_ddim_sampling_parameters, make_ddim
 from unifolm_wma.utils.common import noise_like
 from unifolm_wma.utils.common import extract_into_tensor
 from tqdm import tqdm
 from unifolm_wma.modules.attention import enable_cross_attn_kv_cache, disable_cross_attn_kv_cache
 from unifolm_wma.modules.networks.wma_model import enable_ctx_cache, disable_ctx_cache
 class DDIMSampler(object):
@@ -67,11 +69,12 @@ class DDIMSampler(object):
            ddim_timesteps=self.ddim_timesteps,
            eta=ddim_eta,
            verbose=verbose)
-        self.register_buffer('ddim_sigmas', ddim_sigmas)
+        # Ensure tensors are on correct device for efficient indexing
-        self.register_buffer('ddim_alphas', ddim_alphas)
+        self.register_buffer('ddim_sigmas', to_torch(torch.as_tensor(ddim_sigmas)))
-        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
+        self.register_buffer('ddim_alphas', to_torch(torch.as_tensor(ddim_alphas)))
        self.register_buffer('ddim_alphas_prev', to_torch(torch.as_tensor(ddim_alphas_prev)))
        self.register_buffer('ddim_sqrt_one_minus_alphas',
-                             np.sqrt(1. - ddim_alphas))
+                             to_torch(torch.as_tensor(np.sqrt(1. - ddim_alphas))))
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) *
            (1 - self.alphas_cumprod / self.alphas_cumprod_prev))
@@ -241,63 +244,70 @@ class DDIMSampler(object):
        dp_ddim_scheduler_action.set_timesteps(len(timesteps))
        dp_ddim_scheduler_state.set_timesteps(len(timesteps))
-        for i, step in enumerate(iterator):
+        ts = torch.empty((b, ), device=device, dtype=torch.long)
-            index = total_steps - i - 1
+        enable_cross_attn_kv_cache(self.model)
-            ts = torch.full((b, ), step, device=device, dtype=torch.long)
+        enable_ctx_cache(self.model)
        try:
            for i, step in enumerate(iterator):
                index = total_steps - i - 1
                ts.fill_(step)
-            # Use mask to blend noised original latent (img_orig) & new sampled latent (img)
+                # Use mask to blend noised original latent (img_orig) & new sampled latent (img)
-            if mask is not None:
+                if mask is not None:
-                assert x0 is not None
+                    assert x0 is not None
-                if clean_cond:
+                    if clean_cond:
-                    img_orig = x0
+                        img_orig = x0
-                else:
+                    else:
-                    img_orig = self.model.q_sample(x0, ts)
+                        img_orig = self.model.q_sample(x0, ts)
-                img = img_orig * mask + (1. - mask) * img
+                    img = img_orig * mask + (1. - mask) * img
-            outs = self.p_sample_ddim(
+                outs = self.p_sample_ddim(
-                img,
+                    img,
-                action,
+                    action,
-                state,
+                    state,
-                cond,
+                    cond,
-                ts,
+                    ts,
-                index=index,
+                    index=index,
-                use_original_steps=ddim_use_original_steps,
+                    use_original_steps=ddim_use_original_steps,
-                quantize_denoised=quantize_denoised,
+                    quantize_denoised=quantize_denoised,
-                temperature=temperature,
+                    temperature=temperature,
-                noise_dropout=noise_dropout,
+                    noise_dropout=noise_dropout,
-                score_corrector=score_corrector,
+                    score_corrector=score_corrector,
-                corrector_kwargs=corrector_kwargs,
+                    corrector_kwargs=corrector_kwargs,
-                unconditional_guidance_scale=unconditional_guidance_scale,
+                    unconditional_guidance_scale=unconditional_guidance_scale,
-                unconditional_conditioning=unconditional_conditioning,
+                    unconditional_conditioning=unconditional_conditioning,
-                mask=mask,
+                    mask=mask,
-                x0=x0,
+                    x0=x0,
-                fs=fs,
+                    fs=fs,
-                guidance_rescale=guidance_rescale,
+                    guidance_rescale=guidance_rescale,
-                **kwargs)
+                    **kwargs)
-            img, pred_x0, model_output_action, model_output_state = outs
+                img, pred_x0, model_output_action, model_output_state = outs
-            action = dp_ddim_scheduler_action.step(
+                action = dp_ddim_scheduler_action.step(
-                model_output_action,
+                    model_output_action,
-                step,
+                    step,
-                action,
+                    action,
-                generator=None,
+                    generator=None,
-            ).prev_sample
+                ).prev_sample
-            state = dp_ddim_scheduler_state.step(
+                state = dp_ddim_scheduler_state.step(
-                model_output_state,
+                    model_output_state,
-                step,
+                    step,
-                state,
+                    state,
-                generator=None,
+                    generator=None,
-            ).prev_sample
+                ).prev_sample
-            if callback: callback(i)
+                if callback: callback(i)
-            if img_callback: img_callback(pred_x0, i)
+                if img_callback: img_callback(pred_x0, i)
-            if index % log_every_t == 0 or index == total_steps - 1:
+                if index % log_every_t == 0 or index == total_steps - 1:
-                intermediates['x_inter'].append(img)
+                    intermediates['x_inter'].append(img)
-                intermediates['pred_x0'].append(pred_x0)
+                    intermediates['pred_x0'].append(pred_x0)
-                intermediates['x_inter_action'].append(action)
+                    intermediates['x_inter_action'].append(action)
-                intermediates['x_inter_state'].append(state)
+                    intermediates['x_inter_state'].append(state)
        finally:
            disable_cross_attn_kv_cache(self.model)
            disable_ctx_cache(self.model)
        return img, action, state, intermediates
@@ -325,10 +335,6 @@ class DDIMSampler(object):
                      guidance_rescale=0.0,
                      **kwargs):
        b, *_, device = *x.shape, x.device
        if x.dim() == 5:
            is_video = True
        else:
            is_video = False
        if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
            model_output, model_output_action, model_output_state = self.model.apply_model(
@@ -377,17 +383,11 @@ class DDIMSampler(object):
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
        sigmas = self.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
-        if is_video:
+        # Use 0-d tensors directly (already on device); broadcasting handles shape
-            size = (b, 1, 1, 1, 1)
+        a_t = alphas[index]
-        else:
+        a_prev = alphas_prev[index]
-            size = (b, 1, 1, 1)
+        sigma_t = sigmas[index]
-
+        sqrt_one_minus_at = sqrt_one_minus_alphas[index]
        a_t = torch.full(size, alphas[index], device=device)
        a_prev = torch.full(size, alphas_prev[index], device=device)
        sigma_t = torch.full(size, sigmas[index], device=device)
        sqrt_one_minus_at = torch.full(size,
                                       sqrt_one_minus_alphas[index],
                                       device=device)
        if self.model.parameterization != "v":
            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
@@ -395,12 +395,8 @@ class DDIMSampler(object):
            pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output)
        if self.model.use_dynamic_rescale:
-            scale_t = torch.full(size,
+            scale_t = self.ddim_scale_arr[index]
-                                 self.ddim_scale_arr[index],
+            prev_scale_t = self.ddim_scale_arr_prev[index]
                                 device=device)
            prev_scale_t = torch.full(size,
                                      self.ddim_scale_arr_prev[index],
                                      device=device)
            rescale = (prev_scale_t / scale_t)
            pred_x0 *= rescale
--- a/src/unifolm_wma/modules/attention.py
+++ b/src/unifolm_wma/modules/attention.py
@@ -98,6 +98,10 @@ class CrossAttention(nn.Module):
        self.text_context_len = text_context_len
        self.agent_state_context_len = agent_state_context_len
        self.agent_action_context_len = agent_action_context_len
        self._kv_cache = {}
        self._kv_cache_enabled = False
        self._kv_fused = False
        self.cross_attention_scale_learnable = cross_attention_scale_learnable
        if self.image_cross_attention:
            self.to_k_ip = nn.Linear(context_dim, inner_dim, bias=False)
@@ -114,6 +118,27 @@ class CrossAttention(nn.Module):
                self.register_parameter('alpha_caa',
                                        nn.Parameter(torch.tensor(0.)))
    def fuse_kv(self):
        """Fuse each separate key/value projection pair into a single Linear.

        ``to_k``/``to_v`` are replaced by ``to_kv``, whose weight is the key
        weight stacked on top of the value weight along the output dimension,
        so ``self.to_kv(ctx).chunk(2, dim=-1)`` yields ``(k, v)``. When
        ``self.image_cross_attention`` is set, the ``_ip``, ``_as`` and ``_aa``
        projection pairs are fused the same way into ``to_kv_ip``,
        ``to_kv_as`` and ``to_kv_aa``. The original ``to_k*``/``to_v*``
        attributes are deleted afterwards.

        The call is idempotent: once ``self._kv_fused`` is set, further calls
        return immediately instead of failing on the already-deleted
        ``to_k``/``to_v`` attributes.

        Returns:
            bool: always ``True``.
        """
        # Guard against double fusion: the source projections no longer exist
        # after the first call, so re-running would raise AttributeError.
        if getattr(self, '_kv_fused', False):
            return True

        def _fuse_pair(suffix):
            # Merge to_k{suffix} / to_v{suffix} into to_kv{suffix}.
            k_attr = f'to_k{suffix}'
            v_attr = f'to_v{suffix}'
            k_w = getattr(self, k_attr).weight  # (inner_dim, context_dim)
            v_w = getattr(self, v_attr).weight
            fused = nn.Linear(k_w.shape[1], k_w.shape[0] * 2, bias=False)
            # Keep the trainability of the source weights; a bare
            # nn.Parameter(...) would silently re-enable gradients on weights
            # that had been frozen.
            fused.weight = nn.Parameter(
                torch.cat([k_w.detach(), v_w.detach()], dim=0),
                requires_grad=bool(k_w.requires_grad or v_w.requires_grad))
            setattr(self, f'to_kv{suffix}', fused)
            delattr(self, k_attr)
            delattr(self, v_attr)

        _fuse_pair('')
        if self.image_cross_attention:
            for suffix in ('_ip', '_as', '_aa'):
                _fuse_pair(suffix)
        self._kv_fused = True
        return True
    def forward(self, x, context=None, mask=None):
        spatial_self_attn = (context is None)
        k_ip, v_ip, out_ip = None, None, None
@@ -140,19 +165,28 @@ class CrossAttention(nn.Module):
                                    self.agent_action_context_len +
                                    self.text_context_len:, :]
-            k = self.to_k(context_ins)
+            if self._kv_fused:
-            v = self.to_v(context_ins)
+                k, v = self.to_kv(context_ins).chunk(2, dim=-1)
-            k_ip = self.to_k_ip(context_image)
+                k_ip, v_ip = self.to_kv_ip(context_image).chunk(2, dim=-1)
-            v_ip = self.to_v_ip(context_image)
+                k_as, v_as = self.to_kv_as(context_agent_state).chunk(2, dim=-1)
-            k_as = self.to_k_as(context_agent_state)
+                k_aa, v_aa = self.to_kv_aa(context_agent_action).chunk(2, dim=-1)
-            v_as = self.to_v_as(context_agent_state)
+            else:
-            k_aa = self.to_k_aa(context_agent_action)
+                k = self.to_k(context_ins)
-            v_aa = self.to_v_aa(context_agent_action)
+                v = self.to_v(context_ins)
                k_ip = self.to_k_ip(context_image)
                v_ip = self.to_v_ip(context_image)
                k_as = self.to_k_as(context_agent_state)
                v_as = self.to_v_as(context_agent_state)
                k_aa = self.to_k_aa(context_agent_action)
                v_aa = self.to_v_aa(context_agent_action)
        else:
            if not spatial_self_attn:
                context = context[:, :self.text_context_len, :]
-            k = self.to_k(context)
+            if self._kv_fused:
-            v = self.to_v(context)
+                k, v = self.to_kv(context).chunk(2, dim=-1)
            else:
                k = self.to_k(context)
                v = self.to_v(context)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h),
                      (q, k, v))
@@ -236,134 +270,162 @@ class CrossAttention(nn.Module):
        k_ip, v_ip, out_ip = None, None, None
        k_as, v_as, out_as = None, None, None
        k_aa, v_aa, out_aa = None, None, None
        attn_mask_aa = None
        h = self.heads
        q = self.to_q(x)
        context = default(context, x)
-        if self.image_cross_attention and not spatial_self_attn:
+        b, _, _ = q.shape
        q = q.unsqueeze(3).reshape(b, q.shape[1], h, self.dim_head).permute(0, 2, 1, 3).reshape(b * h, q.shape[1], self.dim_head).contiguous()
        def _reshape_kv(t):
            return t.unsqueeze(3).reshape(b, t.shape[1], h, self.dim_head).permute(0, 2, 1, 3).reshape(b * h, t.shape[1], self.dim_head).contiguous()
        use_cache = self._kv_cache_enabled and not spatial_self_attn
        cache_hit = use_cache and len(self._kv_cache) > 0
        if cache_hit:
            k = self._kv_cache['k']
            v = self._kv_cache['v']
            k_ip = self._kv_cache.get('k_ip')
            v_ip = self._kv_cache.get('v_ip')
            k_as = self._kv_cache.get('k_as')
            v_as = self._kv_cache.get('v_as')
            k_aa = self._kv_cache.get('k_aa')
            v_aa = self._kv_cache.get('v_aa')
            attn_mask_aa = self._kv_cache.get('attn_mask_aa')
        elif self.image_cross_attention and not spatial_self_attn:
            if context.shape[1] == self.text_context_len + self.video_length:
                context_ins, context_image = context[:, :self.text_context_len, :], context[:,self.text_context_len:, :]
-                k = self.to_k(context)
+                if self._kv_fused:
-                v = self.to_v(context)
+                    k, v = self.to_kv(context).chunk(2, dim=-1)
-                k_ip = self.to_k_ip(context_image)
+                    k_ip, v_ip = self.to_kv_ip(context_image).chunk(2, dim=-1)
-                v_ip = self.to_v_ip(context_image)
+                else:
                    k = self.to_k(context)
                    v = self.to_v(context)
                    k_ip = self.to_k_ip(context_image)
                    v_ip = self.to_v_ip(context_image)
                k, v = map(_reshape_kv, (k, v))
                k_ip, v_ip = map(_reshape_kv, (k_ip, v_ip))
                if use_cache:
                    self._kv_cache = {'k': k, 'v': v, 'k_ip': k_ip, 'v_ip': v_ip}
            elif context.shape[1] == self.agent_state_context_len + self.text_context_len + self.video_length:
                context_agent_state = context[:, :self.agent_state_context_len, :]
                context_ins = context[:, self.agent_state_context_len:self.agent_state_context_len+self.text_context_len, :]
                context_image = context[:, self.agent_state_context_len+self.text_context_len:, :]
-                k = self.to_k(context_ins)
+                if self._kv_fused:
-                v = self.to_v(context_ins)
+                    k, v = self.to_kv(context_ins).chunk(2, dim=-1)
-                k_ip = self.to_k_ip(context_image)
+                    k_ip, v_ip = self.to_kv_ip(context_image).chunk(2, dim=-1)
-                v_ip = self.to_v_ip(context_image)
+                    k_as, v_as = self.to_kv_as(context_agent_state).chunk(2, dim=-1)
-                k_as = self.to_k_as(context_agent_state)
+                else:
-                v_as = self.to_v_as(context_agent_state)
+                    k = self.to_k(context_ins)
                    v = self.to_v(context_ins)
                    k_ip = self.to_k_ip(context_image)
                    v_ip = self.to_v_ip(context_image)
                    k_as = self.to_k_as(context_agent_state)
                    v_as = self.to_v_as(context_agent_state)
                k, v = map(_reshape_kv, (k, v))
                k_ip, v_ip = map(_reshape_kv, (k_ip, v_ip))
                k_as, v_as = map(_reshape_kv, (k_as, v_as))
                if use_cache:
                    self._kv_cache = {'k': k, 'v': v, 'k_ip': k_ip, 'v_ip': v_ip, 'k_as': k_as, 'v_as': v_as}
            else:
                context_agent_state = context[:, :self.agent_state_context_len, :]
                context_agent_action = context[:, self.agent_state_context_len:self.agent_state_context_len+self.agent_action_context_len, :]
                context_ins = context[:, self.agent_state_context_len+self.agent_action_context_len:self.agent_state_context_len+self.agent_action_context_len+self.text_context_len, :]
                context_image = context[:, self.agent_state_context_len+self.agent_action_context_len+self.text_context_len:, :]
-                k = self.to_k(context_ins)
+                if self._kv_fused:
-                v = self.to_v(context_ins)
+                    k, v = self.to_kv(context_ins).chunk(2, dim=-1)
-                k_ip = self.to_k_ip(context_image)
+                    k_ip, v_ip = self.to_kv_ip(context_image).chunk(2, dim=-1)
-                v_ip = self.to_v_ip(context_image)
+                    k_as, v_as = self.to_kv_as(context_agent_state).chunk(2, dim=-1)
-                k_as = self.to_k_as(context_agent_state)
+                    k_aa, v_aa = self.to_kv_aa(context_agent_action).chunk(2, dim=-1)
-                v_as = self.to_v_as(context_agent_state)
+                else:
-                k_aa = self.to_k_aa(context_agent_action)
+                    k = self.to_k(context_ins)
-                v_aa = self.to_v_aa(context_agent_action)
+                    v = self.to_v(context_ins)
                    k_ip = self.to_k_ip(context_image)
                    v_ip = self.to_v_ip(context_image)
                    k_as = self.to_k_as(context_agent_state)
                    v_as = self.to_v_as(context_agent_state)
                    k_aa = self.to_k_aa(context_agent_action)
                    v_aa = self.to_v_aa(context_agent_action)
-                attn_mask_aa = self._get_attn_mask_aa(x.shape[0],
+                k, v = map(_reshape_kv, (k, v))
-                                                      q.shape[1],
+                k_ip, v_ip = map(_reshape_kv, (k_ip, v_ip))
-                                                      k_aa.shape[1],
+                k_as, v_as = map(_reshape_kv, (k_as, v_as))
-                                                      block_size=16).to(k_aa.device)
+                k_aa, v_aa = map(_reshape_kv, (k_aa, v_aa))
                attn_mask_aa_raw = self._get_attn_mask_aa(x.shape[0],
                                                          q.shape[1],
                                                          k_aa.shape[1],
                                                          block_size=16,
                                                          device=k_aa.device)
                attn_mask_aa = attn_mask_aa_raw.unsqueeze(1).repeat(1, h, 1, 1).reshape(
                    b * h, attn_mask_aa_raw.shape[1], attn_mask_aa_raw.shape[2]).to(q.dtype)
                if use_cache:
                    self._kv_cache = {
                        'k': k, 'v': v, 'k_ip': k_ip, 'v_ip': v_ip,
                        'k_as': k_as, 'v_as': v_as, 'k_aa': k_aa, 'v_aa': v_aa,
                        'attn_mask_aa': attn_mask_aa,
                    }
        else:
            if not spatial_self_attn:
                assert 1 > 2, ">>> ERROR: you should never go into here ..."
                context = context[:, :self.text_context_len, :]
-            k = self.to_k(context)
+            if self._kv_fused:
-            v = self.to_v(context)
+                k, v = self.to_kv(context).chunk(2, dim=-1)
-
+            else:
-        b, _, _ = q.shape
+                k = self.to_k(context)
-        q = q.unsqueeze(3).reshape(b, q.shape[1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(b * self.heads, q.shape[1], self.dim_head).contiguous()
+                v = self.to_v(context)
            k, v = map(_reshape_kv, (k, v))
            if use_cache:
                self._kv_cache = {'k': k, 'v': v}
        if k is not None:
            k, v = map(
                lambda t: t.unsqueeze(3).reshape(b, t.shape[
                    1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(
                        b * self.heads, t.shape[1], self.dim_head).contiguous(),
                (k, v),
            )
            out = xformers.ops.memory_efficient_attention(q,
                                                          k,
                                                          v,
                                                          attn_bias=None,
                                                          op=None)
            out = (out.unsqueeze(0).reshape(
-                b, self.heads, out.shape[1],
+                b, h, out.shape[1],
                self.dim_head).permute(0, 2, 1,
                                       3).reshape(b, out.shape[1],
-                                                  self.heads * self.dim_head))
+                                                  h * self.dim_head))
        if k_ip is not None:
            # For image cross-attention
            k_ip, v_ip = map(
                lambda t: t.unsqueeze(3).reshape(b, t.shape[
                    1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(
                        b * self.heads, t.shape[1], self.dim_head).contiguous(
                        ),
                (k_ip, v_ip),
            )
            out_ip = xformers.ops.memory_efficient_attention(q,
                                                             k_ip,
                                                             v_ip,
                                                             attn_bias=None,
                                                             op=None)
            out_ip = (out_ip.unsqueeze(0).reshape(
-                b, self.heads, out_ip.shape[1],
+                b, h, out_ip.shape[1],
                self.dim_head).permute(0, 2, 1,
                                       3).reshape(b, out_ip.shape[1],
-                                                  self.heads * self.dim_head))
+                                                  h * self.dim_head))
        if k_as is not None:
            # For agent state cross-attention
            k_as, v_as = map(
                lambda t: t.unsqueeze(3).reshape(b, t.shape[
                    1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(
                        b * self.heads, t.shape[1], self.dim_head).contiguous(
                        ),
                (k_as, v_as),
            )
            out_as = xformers.ops.memory_efficient_attention(q,
                                                             k_as,
                                                             v_as,
                                                             attn_bias=None,
                                                             op=None)
            out_as = (out_as.unsqueeze(0).reshape(
-                b, self.heads, out_as.shape[1],
+                b, h, out_as.shape[1],
                self.dim_head).permute(0, 2, 1,
                                       3).reshape(b, out_as.shape[1],
-                                                  self.heads * self.dim_head))
+                                                  h * self.dim_head))
        if k_aa is not None:
            # For agent action cross-attention
            k_aa, v_aa = map(
                lambda t: t.unsqueeze(3).reshape(b, t.shape[
                    1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(
                        b * self.heads, t.shape[1], self.dim_head).contiguous(
                        ),
                (k_aa, v_aa),
            )
            attn_mask_aa = attn_mask_aa.unsqueeze(1).repeat(1,self.heads,1,1).reshape(
                    b * self.heads, attn_mask_aa.shape[1], attn_mask_aa.shape[2])
            attn_mask_aa = attn_mask_aa.to(q.dtype)
            out_aa = xformers.ops.memory_efficient_attention(
                q, k_aa, v_aa, attn_bias=attn_mask_aa, op=None)
            out_aa = (out_aa.unsqueeze(0).reshape(
-                b, self.heads, out_aa.shape[1],
+                b, h, out_aa.shape[1],
                self.dim_head).permute(0, 2, 1,
                                       3).reshape(b, out_aa.shape[1],
-                                                  self.heads * self.dim_head))
+                                                  h * self.dim_head))
        if exists(mask):
            raise NotImplementedError
@@ -386,17 +448,43 @@ class CrossAttention(nn.Module):
        return self.to_out(out)
-    def _get_attn_mask_aa(self, b, l1, l2, block_size=16):
+    def _get_attn_mask_aa(self, b, l1, l2, block_size=16, device=None):
        cache_key = (b, l1, l2, block_size)
        if hasattr(self, '_attn_mask_aa_cache_key') and self._attn_mask_aa_cache_key == cache_key:
            cached = self._attn_mask_aa_cache
            if device is not None and cached.device != torch.device(device):
                cached = cached.to(device)
                self._attn_mask_aa_cache = cached
            return cached
        target_device = device if device is not None else 'cpu'
        num_token = l2 // block_size
-        start_positions = ((torch.arange(b) % block_size) + 1) * num_token
+        start_positions = ((torch.arange(b, device=target_device) % block_size) + 1) * num_token
-        col_indices = torch.arange(l2)
+        col_indices = torch.arange(l2, device=target_device)
        mask_2d = col_indices.unsqueeze(0) >= start_positions.unsqueeze(1)
        mask = mask_2d.unsqueeze(1).expand(b, l1, l2)
-        attn_mask = torch.zeros_like(mask, dtype=torch.float)
+        attn_mask = torch.zeros(b, l1, l2, dtype=torch.float, device=target_device)
        attn_mask[mask] = float('-inf')
        self._attn_mask_aa_cache_key = cache_key
        self._attn_mask_aa_cache = attn_mask
        return attn_mask
 def enable_cross_attn_kv_cache(module):
    """Turn on key/value caching for every CrossAttention inside ``module``.

    Each CrossAttention instance yielded by ``module.modules()`` gets its
    ``_kv_cache_enabled`` flag set to True and its ``_kv_cache`` reset to a
    fresh empty dict. Other submodules are left untouched.
    """
    attn_layers = (sub for sub in module.modules()
                   if isinstance(sub, CrossAttention))
    for layer in attn_layers:
        layer._kv_cache_enabled = True
        layer._kv_cache = {}
 def disable_cross_attn_kv_cache(module):
    """Turn off key/value caching for every CrossAttention inside ``module``.

    Each CrossAttention instance yielded by ``module.modules()`` gets its
    ``_kv_cache_enabled`` flag cleared and its ``_kv_cache`` replaced by a
    fresh empty dict, dropping whatever was cached.
    """
    for sub in module.modules():
        if not isinstance(sub, CrossAttention):
            continue
        sub._kv_cache_enabled = False
        sub._kv_cache = {}
 class BasicTransformerBlock(nn.Module):
    def __init__(self,
--- a/src/unifolm_wma/modules/networks/wma_model.py
+++ b/src/unifolm_wma/modules/networks/wma_model.py
@@ -685,6 +685,21 @@ class WMAModel(nn.Module):
        self.action_token_projector = instantiate_from_config(
            stem_process_config)
        # Context precomputation cache
        self._ctx_cache_enabled = False
        self._ctx_cache = {}
        # Reusable CUDA stream for parallel state_unet / action_unet
        self._state_stream = torch.cuda.Stream()
    def __getstate__(self):
        """Return a shallow copy of the instance dict without ``_state_stream``.

        The stream handle is left out of the pickled state; ``__setstate__``
        creates a new one on restore. The live instance is not modified.
        """
        return {
            name: value
            for name, value in self.__dict__.items()
            if name != '_state_stream'
        }
    def __setstate__(self, state):
        """Restore attributes from ``state`` and attach a new CUDA stream.

        ``state`` is merged into the instance dict first; ``_state_stream`` is
        then set to a newly constructed ``torch.cuda.Stream()``.
        """
        vars(self).update(state)
        setattr(self, '_state_stream', torch.cuda.Stream())
    def forward(self,
                x: Tensor,
                x_action: Tensor,
@@ -720,58 +735,64 @@ class WMAModel(nn.Module):
                                   repeat_only=False).type(x.dtype)
        emb = self.time_embed(t_emb)
-        bt, l_context, _ = context.shape
+        _ctx_key = context.data_ptr()
-        if self.base_model_gen_only:
+        if self._ctx_cache_enabled and _ctx_key in self._ctx_cache:
-            assert l_context == 77 + self.n_obs_steps * 16, ">>> ERROR Context dim 1 ..."  ## NOTE HANDCODE
+            context = self._ctx_cache[_ctx_key]
        else:
-            if l_context == self.n_obs_steps + 77 + t * 16:
+            bt, l_context, _ = context.shape
-                context_agent_state = context[:, :self.n_obs_steps]
+            if self.base_model_gen_only:
-                context_text = context[:, self.n_obs_steps:self.n_obs_steps +
+                assert l_context == 77 + self.n_obs_steps * 16, ">>> ERROR Context dim 1 ..."  ## NOTE HANDCODE
-                                       77, :]
+            else:
-                context_img = context[:, self.n_obs_steps + 77:, :]
+                if l_context == self.n_obs_steps + 77 + t * 16:
-                context_agent_state = context_agent_state.repeat_interleave(
+                    context_agent_state = context[:, :self.n_obs_steps]
-                    repeats=t, dim=0)
+                    context_text = context[:, self.n_obs_steps:self.n_obs_steps +
-                context_text = context_text.repeat_interleave(repeats=t, dim=0)
+                                           77, :]
-                context_img = rearrange(context_img,
+                    context_img = context[:, self.n_obs_steps + 77:, :]
-                                        'b (t l) c -> (b t) l c',
+                    context_agent_state = context_agent_state.repeat_interleave(
-                                        t=t)
+                        repeats=t, dim=0)
-                context = torch.cat(
+                    context_text = context_text.repeat_interleave(repeats=t, dim=0)
-                    [context_agent_state, context_text, context_img], dim=1)
+                    context_img = rearrange(context_img,
-            elif l_context == self.n_obs_steps + 16 + 77 + t * 16:
+                                            'b (t l) c -> (b t) l c',
-                context_agent_state = context[:, :self.n_obs_steps]
+                                            t=t)
-                context_agent_action = context[:, self.
+                    context = torch.cat(
-                                               n_obs_steps:self.n_obs_steps +
+                        [context_agent_state, context_text, context_img], dim=1)
-                                               16, :]
+                elif l_context == self.n_obs_steps + 16 + 77 + t * 16:
-                context_agent_action = rearrange(
+                    context_agent_state = context[:, :self.n_obs_steps]
-                    context_agent_action.unsqueeze(2), 'b t l d -> (b t) l d')
+                    context_agent_action = context[:, self.
-                context_agent_action = self.action_token_projector(
+                                                   n_obs_steps:self.n_obs_steps +
-                    context_agent_action)
+                                                   16, :]
-                context_agent_action = rearrange(context_agent_action,
+                    context_agent_action = rearrange(
-                                                 '(b o) l d -> b o l d',
+                        context_agent_action.unsqueeze(2), 'b t l d -> (b t) l d')
-                                                 o=t)
+                    context_agent_action = self.action_token_projector(
-                context_agent_action = rearrange(context_agent_action,
+                        context_agent_action)
-                                                 'b o (t l) d -> b o t l d',
+                    context_agent_action = rearrange(context_agent_action,
-                                                 t=t)
+                                                     '(b o) l d -> b o l d',
-                context_agent_action = context_agent_action.permute(
+                                                     o=t)
-                    0, 2, 1, 3, 4)
+                    context_agent_action = rearrange(context_agent_action,
-                context_agent_action = rearrange(context_agent_action,
+                                                     'b o (t l) d -> b o t l d',
-                                                 'b t o l d -> (b t) (o l) d')
+                                                     t=t)
                    context_agent_action = context_agent_action.permute(
                        0, 2, 1, 3, 4)
                    context_agent_action = rearrange(context_agent_action,
                                                     'b t o l d -> (b t) (o l) d')
-                context_text = context[:, self.n_obs_steps +
+                    context_text = context[:, self.n_obs_steps +
-                                       16:self.n_obs_steps + 16 + 77, :]
+                                           16:self.n_obs_steps + 16 + 77, :]
-                context_text = context_text.repeat_interleave(repeats=t, dim=0)
+                    context_text = context_text.repeat_interleave(repeats=t, dim=0)
-                context_img = context[:, self.n_obs_steps + 16 + 77:, :]
+                    context_img = context[:, self.n_obs_steps + 16 + 77:, :]
-                context_img = rearrange(context_img,
+                    context_img = rearrange(context_img,
-                                        'b (t l) c -> (b t) l c',
+                                            'b (t l) c -> (b t) l c',
-                                        t=t)
+                                            t=t)
-                context_agent_state = context_agent_state.repeat_interleave(
+                    context_agent_state = context_agent_state.repeat_interleave(
-                    repeats=t, dim=0)
+                        repeats=t, dim=0)
-                context = torch.cat([
+                    context = torch.cat([
-                    context_agent_state, context_agent_action, context_text,
+                        context_agent_state, context_agent_action, context_text,
-                    context_img
+                        context_img
-                ],
+                    ],
-                                    dim=1)
+                                        dim=1)
            if self._ctx_cache_enabled:
                self._ctx_cache[_ctx_key] = context
        emb = emb.repeat_interleave(repeats=t, dim=0)
@@ -832,17 +853,45 @@ class WMAModel(nn.Module):
        if not self.base_model_gen_only:
            ba, _, _ = x_action.shape
            ts_state = timesteps[:ba] if b > 1 else timesteps
            # Run action_unet and state_unet in parallel via CUDA streams
            s_stream = self._state_stream
            s_stream.wait_stream(torch.cuda.current_stream())
            with torch.cuda.stream(s_stream):
                s_y = self.state_unet(x_state, ts_state, hs_a,
                                      context_action[:2], **kwargs)
            a_y = self.action_unet(x_action, timesteps[:ba], hs_a,
                                   context_action[:2], **kwargs)
-            # Predict state
+            torch.cuda.current_stream().wait_stream(s_stream)
            if b > 1:
                s_y = self.state_unet(x_state, timesteps[:ba], hs_a,
                                      context_action[:2], **kwargs)
            else:
                s_y = self.state_unet(x_state, timesteps, hs_a,
                                      context_action[:2], **kwargs)
        else:
            a_y = torch.zeros_like(x_action)
            s_y = torch.zeros_like(x_state)
        return y, a_y, s_y
 def enable_ctx_cache(model):
    """Switch on the context precomputation caches inside ``model``.

    Every WMAModel submodule gets ``_ctx_cache_enabled = True`` with an empty
    ``_ctx_cache``; every ConditionalUnet1D submodule gets
    ``_global_cond_cache_enabled = True`` with an empty ``_global_cond_cache``.
    """
    wma_modules = (sub for sub in model.modules() if isinstance(sub, WMAModel))
    for wma in wma_modules:
        wma._ctx_cache_enabled = True
        wma._ctx_cache = {}
    # Imported inside the function rather than at module level
    # (presumably to avoid an import cycle -- confirm).
    from unifolm_wma.models.diffusion_head.conditional_unet1d import ConditionalUnet1D
    unet_heads = (sub for sub in model.modules()
                  if isinstance(sub, ConditionalUnet1D))
    for head in unet_heads:
        head._global_cond_cache_enabled = True
        head._global_cond_cache = {}
 def disable_ctx_cache(model):
    """Switch off and empty the context precomputation caches inside ``model``.

    Every WMAModel submodule gets ``_ctx_cache_enabled = False`` with an empty
    ``_ctx_cache``; every ConditionalUnet1D submodule gets
    ``_global_cond_cache_enabled = False`` with an empty ``_global_cond_cache``.
    """
    wma_modules = (sub for sub in model.modules() if isinstance(sub, WMAModel))
    for wma in wma_modules:
        wma._ctx_cache_enabled = False
        wma._ctx_cache = {}
    # Imported inside the function rather than at module level
    # (presumably to avoid an import cycle -- confirm).
    from unifolm_wma.models.diffusion_head.conditional_unet1d import ConditionalUnet1D
    unet_heads = (sub for sub in model.modules()
                  if isinstance(sub, ConditionalUnet1D))
    for head in unet_heads:
        head._global_cond_cache_enabled = False
        head._global_cond_cache = {}
--- a/unitree_g1_pack_camera/case1/output.log
+++ b/unitree_g1_pack_camera/case1/output.log
@@ -0,0 +1,123 @@
 2026-02-11 19:14:09.599811: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 19:14:09.649058: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 19:14:09.649103: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 19:14:09.650392: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 19:14:09.657857: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 19:14:10.584900: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:22<03:49, 22.92s/it]
 18%|█▊        | 2/11 [00:45<03:22, 22.52s/it]
 27%|██▋       | 3/11 [01:07<03:00, 22.52s/it]
 36%|███▋      | 4/11 [01:30<02:38, 22.60s/it]
 45%|████▌     | 5/11 [01:53<02:16, 22.70s/it]
 55%|█████▍    | 6/11 [02:16<01:53, 22.74s/it]
 64%|██████▎   | 7/11 [02:38<01:31, 22.76s/it]
 73%|███████▎  | 8/11 [03:01<01:08, 22.77s/it]
 82%|████████▏ | 9/11 [03:24<00:45, 22.76s/it]
 91%|█████████ | 10/11 [03:47<00:22, 22.76s/it]
 100%|██████████| 11/11 [04:09<00:00, 22.77s/it]
 100%|██████████| 11/11 [04:09<00:00, 22.73s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 9: generating actions ...
 >>> Step 9: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 10: generating actions ...
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
--- a/unitree_g1_pack_camera/case1/psnr_result.json
+++ b/unitree_g1_pack_camera/case1/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_g1_pack_camera/case1/unitree_g1_pack_camera_case1.mp4",
    "pred_video": "unitree_g1_pack_camera/case1/output/inference/0_full_fs6.mp4",
    "psnr": 32.340256576190384
 }
--- a/unitree_g1_pack_camera/case1/run_world_model_interaction.sh
+++ b/unitree_g1_pack_camera/case1/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_g1_pack_camera"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_g1_pack_camera/case2/output.log
+++ b/unitree_g1_pack_camera/case2/output.log
@@ -0,0 +1,123 @@
 2026-02-11 17:41:30.163933: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 17:41:30.213409: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 17:41:30.213453: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 17:41:30.214760: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 17:41:30.222233: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 17:41:31.146811: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:23<03:53, 23.40s/it]
 18%|█▊        | 2/11 [00:46<03:26, 23.00s/it]
 27%|██▋       | 3/11 [01:08<03:03, 22.93s/it]
 36%|███▋      | 4/11 [01:31<02:40, 22.88s/it]
 45%|████▌     | 5/11 [01:54<02:17, 22.86s/it]
 55%|█████▍    | 6/11 [02:17<01:54, 22.84s/it]
 64%|██████▎   | 7/11 [02:40<01:31, 22.82s/it]
 73%|███████▎  | 8/11 [03:02<01:08, 22.80s/it]
 82%|████████▏ | 9/11 [03:25<00:45, 22.78s/it]
 91%|█████████ | 10/11 [03:48<00:22, 22.77s/it]
 100%|██████████| 11/11 [04:11<00:00, 22.76s/it]
 100%|██████████| 11/11 [04:11<00:00, 22.83s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 9: generating actions ...
 >>> Step 9: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 10: generating actions ...
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
--- a/unitree_g1_pack_camera/case2/psnr_result.json
+++ b/unitree_g1_pack_camera/case2/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_g1_pack_camera/case2/unitree_g1_pack_camera_case2.mp4",
    "pred_video": "unitree_g1_pack_camera/case2/output/inference/50_full_fs6.mp4",
    "psnr": 37.49178506869336
 }
--- a/unitree_g1_pack_camera/case2/run_world_model_interaction.sh
+++ b/unitree_g1_pack_camera/case2/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_g1_pack_camera"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_g1_pack_camera/case3/output.log
+++ b/unitree_g1_pack_camera/case3/output.log
@@ -0,0 +1,123 @@
 2026-02-11 17:46:20.925463: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 17:46:20.976293: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 17:46:20.976338: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 17:46:20.977650: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 17:46:20.985133: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 17:46:21.909964: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:23<03:54, 23.50s/it]
 18%|█▊        | 2/11 [00:46<03:27, 23.07s/it]
 27%|██▋       | 3/11 [01:09<03:03, 22.99s/it]
 36%|███▋      | 4/11 [01:32<02:40, 22.94s/it]
 45%|████▌     | 5/11 [01:54<02:17, 22.90s/it]
 55%|█████▍    | 6/11 [02:17<01:54, 22.87s/it]
 64%|██████▎   | 7/11 [02:40<01:31, 22.85s/it]
 73%|███████▎  | 8/11 [03:03<01:08, 22.83s/it]
 82%|████████▏ | 9/11 [03:26<00:45, 22.81s/it]
 91%|█████████ | 10/11 [03:48<00:22, 22.78s/it]
 100%|██████████| 11/11 [04:11<00:00, 22.76s/it]
 100%|██████████| 11/11 [04:11<00:00, 22.86s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 9: generating actions ...
 >>> Step 9: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 10: generating actions ...
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
--- a/unitree_g1_pack_camera/case3/psnr_result.json
+++ b/unitree_g1_pack_camera/case3/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_g1_pack_camera/case3/unitree_g1_pack_camera_case3.mp4",
    "pred_video": "unitree_g1_pack_camera/case3/output/inference/100_full_fs6.mp4",
    "psnr": 29.88155122131729
 }
--- a/unitree_g1_pack_camera/case3/run_world_model_interaction.sh
+++ b/unitree_g1_pack_camera/case3/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_g1_pack_camera"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_g1_pack_camera/case4/output.log
+++ b/unitree_g1_pack_camera/case4/output.log
@@ -0,0 +1,123 @@
 2026-02-11 17:51:11.566934: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 17:51:11.616260: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 17:51:11.616305: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 17:51:11.617626: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 17:51:11.625103: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 17:51:12.538539: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:23<03:53, 23.39s/it]
 18%|█▊        | 2/11 [00:46<03:26, 22.96s/it]
 27%|██▋       | 3/11 [01:08<03:03, 22.89s/it]
 36%|███▋      | 4/11 [01:31<02:40, 22.86s/it]
 45%|████▌     | 5/11 [01:54<02:16, 22.82s/it]
 55%|█████▍    | 6/11 [02:17<01:54, 22.80s/it]
 64%|██████▎   | 7/11 [02:39<01:31, 22.77s/it]
 73%|███████▎  | 8/11 [03:02<01:08, 22.75s/it]
 82%|████████▏ | 9/11 [03:25<00:45, 22.73s/it]
 91%|█████████ | 10/11 [03:47<00:22, 22.72s/it]
 100%|██████████| 11/11 [04:10<00:00, 22.73s/it]
 100%|██████████| 11/11 [04:10<00:00, 22.79s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 9: generating actions ...
 >>> Step 9: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 10: generating actions ...
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
--- a/unitree_g1_pack_camera/case4/psnr_result.json
+++ b/unitree_g1_pack_camera/case4/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_g1_pack_camera/case4/unitree_g1_pack_camera_case4.mp4",
    "pred_video": "unitree_g1_pack_camera/case4/output/inference/200_full_fs6.mp4",
    "psnr": 35.62512454155058
 }
--- a/unitree_g1_pack_camera/case4/run_world_model_interaction.sh
+++ b/unitree_g1_pack_camera/case4/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_g1_pack_camera"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
@@ -0,0 +1,114 @@
 2026-02-11 17:56:01.170137: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 17:56:01.219541: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 17:56:01.219584: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 17:56:01.220897: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 17:56:01.228350: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 17:56:02.145344: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:23<02:43, 23.34s/it]
 25%|██▌       | 2/8 [00:46<02:17, 22.96s/it]
 38%|███▊      | 3/8 [01:08<01:54, 22.88s/it]
 50%|█████     | 4/8 [01:31<01:31, 22.82s/it]
 62%|██████▎   | 5/8 [01:54<01:08, 22.78s/it]
 75%|███████▌  | 6/8 [02:16<00:45, 22.76s/it]
 88%|████████▊ | 7/8 [02:39<00:22, 22.73s/it]
 100%|██████████| 8/8 [03:02<00:00, 22.72s/it]
 100%|██████████| 8/8 [03:02<00:00, 22.79s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case1/unitree_z1_dual_arm_cleanup_pencils_case1.mp4",
    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/0_full_fs4.mp4",
    "psnr": 38.269577028444445
 }
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_cleanup_pencils"
        --n_iter 8 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_cleanup_pencils/case2/output.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case2/output.log
@@ -0,0 +1,114 @@
 2026-02-11 17:59:40.132715: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 17:59:40.183410: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 17:59:40.183456: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 17:59:40.184784: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 17:59:40.192307: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 17:59:41.105025: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:23<02:43, 23.41s/it]
 25%|██▌       | 2/8 [00:46<02:18, 23.00s/it]
 38%|███▊      | 3/8 [01:08<01:54, 22.94s/it]
 50%|█████     | 4/8 [01:31<01:31, 22.86s/it]
 62%|██████▎   | 5/8 [01:54<01:08, 22.82s/it]
 75%|███████▌  | 6/8 [02:17<00:45, 22.78s/it]
 88%|████████▊ | 7/8 [02:39<00:22, 22.77s/it]
 100%|██████████| 8/8 [03:02<00:00, 22.75s/it]
 100%|██████████| 8/8 [03:02<00:00, 22.83s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
--- a/unitree_z1_dual_arm_cleanup_pencils/case2/psnr_result.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case2/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case2/unitree_z1_dual_arm_cleanup_pencils_case2.mp4",
    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case2/output/inference/50_full_fs4.mp4",
    "psnr": 44.38754096950435
 }
--- a/unitree_z1_dual_arm_cleanup_pencils/case2/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_cleanup_pencils/case2/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_cleanup_pencils"
        --n_iter 8 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_cleanup_pencils/case3/output.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case3/output.log
@@ -0,0 +1,114 @@
 2026-02-11 18:03:19.373691: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:03:19.423144: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:03:19.423201: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:03:19.424504: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:03:19.431968: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:03:20.342432: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:23<02:44, 23.45s/it]
 25%|██▌       | 2/8 [00:46<02:17, 22.99s/it]
 38%|███▊      | 3/8 [01:09<01:54, 22.94s/it]
 50%|█████     | 4/8 [01:31<01:31, 22.89s/it]
 62%|██████▎   | 5/8 [01:54<01:08, 22.84s/it]
 75%|███████▌  | 6/8 [02:17<00:45, 22.82s/it]
 88%|████████▊ | 7/8 [02:40<00:22, 22.81s/it]
 100%|██████████| 8/8 [03:02<00:00, 22.79s/it]
 100%|██████████| 8/8 [03:02<00:00, 22.86s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
--- a/unitree_z1_dual_arm_cleanup_pencils/case3/psnr_result.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case3/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case3/unitree_z1_dual_arm_cleanup_pencils_case3.mp4",
    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case3/output/inference/100_full_fs4.mp4",
    "psnr": 32.29959078097713
 }
--- a/unitree_z1_dual_arm_cleanup_pencils/case3/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_cleanup_pencils/case3/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_cleanup_pencils"
        --n_iter 8 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_cleanup_pencils/case4/output.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case4/output.log
@@ -0,0 +1,114 @@
 2026-02-11 18:06:58.863806: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:06:58.913518: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:06:58.913565: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:06:58.914918: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:06:58.922497: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:06:59.840461: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:23<02:44, 23.47s/it]
 25%|██▌       | 2/8 [00:46<02:18, 23.01s/it]
 38%|███▊      | 3/8 [01:09<01:54, 22.94s/it]
 50%|█████     | 4/8 [01:31<01:31, 22.89s/it]
 62%|██████▎   | 5/8 [01:54<01:08, 22.85s/it]
 75%|███████▌  | 6/8 [02:17<00:45, 22.81s/it]
 88%|████████▊ | 7/8 [02:40<00:22, 22.79s/it]
 100%|██████████| 8/8 [03:02<00:00, 22.77s/it]
 100%|██████████| 8/8 [03:02<00:00, 22.85s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
--- a/unitree_z1_dual_arm_cleanup_pencils/case4/psnr_result.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case4/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case4/unitree_z1_dual_arm_cleanup_pencils_case4.mp4",
    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case4/output/inference/200_full_fs4.mp4",
    "psnr": 45.051241961122535
 }
--- a/unitree_z1_dual_arm_cleanup_pencils/case4/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_cleanup_pencils/case4/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_cleanup_pencils"
        --n_iter 8 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox/case1/output.log
+++ b/unitree_z1_dual_arm_stackbox/case1/output.log
@@ -0,0 +1,111 @@
 2026-02-11 18:10:38.361867: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:10:38.412126: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:10:38.412182: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:10:38.413493: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:10:38.420963: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:10:39.335981: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/7 [00:00<?, ?it/s]
 14%|█▍        | 1/7 [00:23<02:20, 23.41s/it]
 29%|██▊       | 2/7 [00:46<01:54, 22.99s/it]
 43%|████▎     | 3/7 [01:08<01:31, 22.92s/it]
 57%|█████▋    | 4/7 [01:31<01:08, 22.88s/it]
 71%|███████▏  | 5/7 [01:54<00:45, 22.82s/it]
 86%|████████▌ | 6/7 [02:17<00:22, 22.79s/it]
 100%|██████████| 7/7 [02:39<00:00, 22.75s/it]
 100%|██████████| 7/7 [02:39<00:00, 22.84s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
--- a/unitree_z1_dual_arm_stackbox/case1/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox/case1/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox/case1/unitree_z1_dual_arm_stackbox_case1.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox/case1/output/inference/5_full_fs4.mp4",
    "psnr": 42.717688631296596
 }
--- a/unitree_z1_dual_arm_stackbox/case1/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox/case1/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox"
        --n_iter 7 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox/case2/output.log
+++ b/unitree_z1_dual_arm_stackbox/case2/output.log
@@ -0,0 +1,111 @@
 2026-02-11 18:13:57.132827: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:13:57.182101: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:13:57.182156: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:13:57.183471: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:13:57.190931: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:13:58.104923: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/7 [00:00<?, ?it/s]
 14%|█▍        | 1/7 [00:23<02:20, 23.34s/it]
 29%|██▊       | 2/7 [00:46<01:54, 22.98s/it]
 43%|████▎     | 3/7 [01:08<01:31, 22.91s/it]
 57%|█████▋    | 4/7 [01:31<01:08, 22.87s/it]
 71%|███████▏  | 5/7 [01:54<00:45, 22.84s/it]
 86%|████████▌ | 6/7 [02:17<00:22, 22.80s/it]
 100%|██████████| 7/7 [02:39<00:00, 22.77s/it]
 100%|██████████| 7/7 [02:39<00:00, 22.84s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
--- a/unitree_z1_dual_arm_stackbox/case2/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox/case2/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox/case2/unitree_z1_dual_arm_stackbox_case2.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox/case2/output/inference/15_full_fs4.mp4",
    "psnr": 44.90750363879194
 }
--- a/unitree_z1_dual_arm_stackbox/case2/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox/case2/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox"
        --n_iter 7 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox/case3/output.log
+++ b/unitree_z1_dual_arm_stackbox/case3/output.log
@@ -0,0 +1,111 @@
 2026-02-11 18:17:16.023670: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:17:16.073206: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:17:16.073251: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:17:16.074552: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:17:16.082033: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:17:16.997362: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/7 [00:00<?, ?it/s]
 14%|█▍        | 1/7 [00:23<02:20, 23.41s/it]
 29%|██▊       | 2/7 [00:46<01:55, 23.03s/it]
 43%|████▎     | 3/7 [01:09<01:31, 22.95s/it]
 57%|█████▋    | 4/7 [01:31<01:08, 22.91s/it]
 71%|███████▏  | 5/7 [01:54<00:45, 22.87s/it]
 86%|████████▌ | 6/7 [02:17<00:22, 22.84s/it]
 100%|██████████| 7/7 [02:40<00:00, 22.82s/it]
 100%|██████████| 7/7 [02:40<00:00, 22.89s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
--- a/unitree_z1_dual_arm_stackbox/case3/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox/case3/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox/case3/unitree_z1_dual_arm_stackbox_case3.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox/case3/output/inference/25_full_fs4.mp4",
    "psnr": 39.63695040491171
 }
--- a/unitree_z1_dual_arm_stackbox/case3/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox/case3/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox"
        --n_iter 7 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox/case4/output.log
+++ b/unitree_z1_dual_arm_stackbox/case4/output.log
@@ -0,0 +1,111 @@
 2026-02-11 18:20:35.210324: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:20:35.259487: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:20:35.259530: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:20:35.260816: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:20:35.268252: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:20:36.181189: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/7 [00:00<?, ?it/s]
 14%|█▍        | 1/7 [00:23<02:20, 23.43s/it]
 29%|██▊       | 2/7 [00:46<01:55, 23.03s/it]
 43%|████▎     | 3/7 [01:09<01:31, 22.96s/it]
 57%|█████▋    | 4/7 [01:31<01:08, 22.92s/it]
 71%|███████▏  | 5/7 [01:54<00:45, 22.89s/it]
 86%|████████▌ | 6/7 [02:17<00:22, 22.86s/it]
 100%|██████████| 7/7 [02:40<00:00, 22.84s/it]
 100%|██████████| 7/7 [02:40<00:00, 22.91s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
--- a/unitree_z1_dual_arm_stackbox/case4/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox/case4/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox/case4/unitree_z1_dual_arm_stackbox_case4.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox/case4/output/inference/35_full_fs4.mp4",
    "psnr": 42.34177660061245
 }
--- a/unitree_z1_dual_arm_stackbox/case4/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox/case4/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox"
        --n_iter 7 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox_v2/case1/output.log
+++ b/unitree_z1_dual_arm_stackbox_v2/case1/output.log
@@ -0,0 +1,123 @@
 2026-02-11 18:23:54.635983: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:23:54.685542: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:23:54.685587: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:23:54.686907: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:23:54.694405: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:23:55.620959: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:23<03:53, 23.38s/it]
 18%|█▊        | 2/11 [00:46<03:26, 22.96s/it]
 27%|██▋       | 3/11 [01:08<03:03, 22.91s/it]
 36%|███▋      | 4/11 [01:31<02:40, 22.86s/it]
 45%|████▌     | 5/11 [01:54<02:16, 22.83s/it]
 55%|█████▍    | 6/11 [02:17<01:54, 22.80s/it]
 64%|██████▎   | 7/11 [02:39<01:31, 22.79s/it]
 73%|███████▎  | 8/11 [03:02<01:08, 22.79s/it]
 82%|████████▏ | 9/11 [03:25<00:45, 22.78s/it]
 91%|█████████ | 10/11 [03:48<00:22, 22.76s/it]
 100%|██████████| 11/11 [04:10<00:00, 22.75s/it]
 100%|██████████| 11/11 [04:10<00:00, 22.82s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 9: generating actions ...
 >>> Step 9: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 10: generating actions ...
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
--- a/unitree_z1_dual_arm_stackbox_v2/case1/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox_v2/case1/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox_v2/case1/unitree_z1_dual_arm_stackbox_v2_case1.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox_v2/case1/output/inference/5_full_fs4.mp4",
    "psnr": 26.683000215343522
 }
--- a/unitree_z1_dual_arm_stackbox_v2/case1/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox_v2/case1/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox_v2"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox_v2/case2/output.log
+++ b/unitree_z1_dual_arm_stackbox_v2/case2/output.log
@@ -0,0 +1,123 @@
 2026-02-11 18:28:48.801743: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:28:48.852069: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:28:48.852128: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:28:48.853466: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:28:48.861133: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:28:49.784354: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:23<03:56, 23.65s/it]
 18%|█▊        | 2/11 [00:46<03:28, 23.13s/it]
 27%|██▋       | 3/11 [01:09<03:04, 23.02s/it]
 36%|███▋      | 4/11 [01:32<02:40, 22.96s/it]
 45%|████▌     | 5/11 [01:55<02:17, 22.92s/it]
 55%|█████▍    | 6/11 [02:17<01:54, 22.88s/it]
 64%|██████▎   | 7/11 [02:40<01:31, 22.84s/it]
 73%|███████▎  | 8/11 [03:03<01:08, 22.81s/it]
 82%|████████▏ | 9/11 [03:26<00:45, 22.81s/it]
 91%|█████████ | 10/11 [03:48<00:22, 22.80s/it]
 100%|██████████| 11/11 [04:11<00:00, 22.80s/it]
 100%|██████████| 11/11 [04:11<00:00, 22.88s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 9: generating actions ...
 >>> Step 9: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 10: generating actions ...
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
--- a/unitree_z1_dual_arm_stackbox_v2/case2/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox_v2/case2/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox_v2/case2/unitree_z1_dual_arm_stackbox_v2_case2.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox_v2/case2/output/inference/15_full_fs4.mp4",
    "psnr": 27.46347145461597
 }
--- a/unitree_z1_dual_arm_stackbox_v2/case2/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox_v2/case2/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox_v2"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox_v2/case3/output.log
+++ b/unitree_z1_dual_arm_stackbox_v2/case3/output.log
@@ -0,0 +1,123 @@
 2026-02-11 18:33:43.119091: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:33:43.169099: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:33:43.169143: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:33:43.170444: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:33:43.177944: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:33:44.102499: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:23<03:53, 23.36s/it]
 18%|█▊        | 2/11 [00:46<03:26, 22.99s/it]
 27%|██▋       | 3/11 [01:08<03:03, 22.93s/it]
 36%|███▋      | 4/11 [01:31<02:40, 22.87s/it]
 45%|████▌     | 5/11 [01:54<02:17, 22.85s/it]
 55%|█████▍    | 6/11 [02:17<01:54, 22.80s/it]
 64%|██████▎   | 7/11 [02:40<01:31, 22.79s/it]
 73%|███████▎  | 8/11 [03:02<01:08, 22.78s/it]
 82%|████████▏ | 9/11 [03:25<00:45, 22.76s/it]
 91%|█████████ | 10/11 [03:48<00:22, 22.74s/it]
 100%|██████████| 11/11 [04:10<00:00, 22.73s/it]
 100%|██████████| 11/11 [04:10<00:00, 22.81s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 9: generating actions ...
 >>> Step 9: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 10: generating actions ...
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
--- a/unitree_z1_dual_arm_stackbox_v2/case3/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox_v2/case3/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox_v2/case3/unitree_z1_dual_arm_stackbox_v2_case3.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox_v2/case3/output/inference/25_full_fs4.mp4",
    "psnr": 28.604047286947512
 }
--- a/unitree_z1_dual_arm_stackbox_v2/case3/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox_v2/case3/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox_v2"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox_v2/case4/output.log
+++ b/unitree_z1_dual_arm_stackbox_v2/case4/output.log
@@ -0,0 +1,123 @@
 2026-02-11 18:38:37.252690: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:38:37.301897: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:38:37.301950: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:38:37.303254: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:38:37.310679: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:38:38.237893: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:23<03:53, 23.38s/it]
 18%|█▊        | 2/11 [00:46<03:26, 22.99s/it]
 27%|██▋       | 3/11 [01:08<03:03, 22.91s/it]
 36%|███▋      | 4/11 [01:31<02:40, 22.86s/it]
 45%|████▌     | 5/11 [01:54<02:16, 22.83s/it]
 55%|█████▍    | 6/11 [02:17<01:54, 22.82s/it]
 64%|██████▎   | 7/11 [02:40<01:31, 22.81s/it]
 73%|███████▎  | 8/11 [03:02<01:08, 22.80s/it]
 82%|████████▏ | 9/11 [03:25<00:45, 22.78s/it]
 91%|█████████ | 10/11 [03:48<00:22, 22.77s/it]
 100%|██████████| 11/11 [04:11<00:00, 22.77s/it]
 100%|██████████| 11/11 [04:11<00:00, 22.83s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 9: generating actions ...
 >>> Step 9: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 10: generating actions ...
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
--- a/unitree_z1_dual_arm_stackbox_v2/case4/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox_v2/case4/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox_v2/case4/unitree_z1_dual_arm_stackbox_v2_case4.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox_v2/case4/output/inference/35_full_fs4.mp4",
    "psnr": 25.578498826379903
 }
--- a/unitree_z1_dual_arm_stackbox_v2/case4/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox_v2/case4/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox_v2"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_stackbox/case1/output.log
+++ b/unitree_z1_stackbox/case1/output.log
@@ -0,0 +1,126 @@
 2026-02-11 18:43:31.592464: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:43:31.641865: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:43:31.641908: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:43:31.643209: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:43:31.650663: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:43:32.564662: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/12 [00:00<?, ?it/s]
  8%|▊         | 1/12 [00:23<04:17, 23.41s/it]
 17%|█▋        | 2/12 [00:46<03:50, 23.03s/it]
 25%|██▌       | 3/12 [01:09<03:26, 22.97s/it]
 33%|███▎      | 4/12 [01:31<03:03, 22.92s/it]
 42%|████▏     | 5/12 [01:54<02:40, 22.88s/it]
 50%|█████     | 6/12 [02:17<02:17, 22.84s/it]
 58%|█████▊    | 7/12 [02:40<01:54, 22.80s/it]
 67%|██████▋   | 8/12 [03:02<01:31, 22.78s/it]
 75%|███████▌  | 9/12 [03:25<01:08, 22.78s/it]
 83%|████████▎ | 10/12 [03:48<00:45, 22.78s/it]
 92%|█████████▏| 11/12 [04:11<00:22, 22.77s/it]
 100%|██████████| 12/12 [04:34<00:00, 22.77s/it]
 100%|██████████| 12/12 [04:34<00:00, 22.83s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 9: generating actions ...
 >>> Step 9: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 10: generating actions ...
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 11: generating actions ...
 >>> Step 11: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
--- a/unitree_z1_stackbox/case1/psnr_result.json
+++ b/unitree_z1_stackbox/case1/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_stackbox/case1/unitree_z1_stackbox_case1.mp4",
    "pred_video": "unitree_z1_stackbox/case1/output/inference/5_full_fs4.mp4",
    "psnr": 46.05271283048069
 }
--- a/unitree_z1_stackbox/case1/run_world_model_interaction.sh
+++ b/unitree_z1_stackbox/case1/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_stackbox"
        --n_iter 12 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_stackbox/case2/output.log
+++ b/unitree_z1_stackbox/case2/output.log
@@ -0,0 +1,126 @@
 2026-02-11 18:48:44.235405: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:48:44.285138: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:48:44.285181: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:48:44.286531: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:48:44.294141: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:48:45.209453: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/12 [00:00<?, ?it/s]
  8%|▊         | 1/12 [00:23<04:17, 23.37s/it]
 17%|█▋        | 2/12 [00:46<03:49, 22.97s/it]
 25%|██▌       | 3/12 [01:08<03:26, 22.91s/it]
 33%|███▎      | 4/12 [01:31<03:02, 22.86s/it]
 42%|████▏     | 5/12 [01:54<02:39, 22.82s/it]
 50%|█████     | 6/12 [02:17<02:16, 22.81s/it]
 58%|█████▊    | 7/12 [02:39<01:53, 22.79s/it]
 67%|██████▋   | 8/12 [03:02<01:31, 22.78s/it]
 75%|███████▌  | 9/12 [03:25<01:08, 22.76s/it]
 83%|████████▎ | 10/12 [03:48<00:45, 22.75s/it]
 92%|█████████▏| 11/12 [04:10<00:22, 22.74s/it]
 100%|██████████| 12/12 [04:33<00:00, 22.72s/it]
 100%|██████████| 12/12 [04:33<00:00, 22.80s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 9: generating actions ...
 >>> Step 9: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 10: generating actions ...
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 11: generating actions ...
 >>> Step 11: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
--- a/unitree_z1_stackbox/case2/psnr_result.json
+++ b/unitree_z1_stackbox/case2/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_stackbox/case2/unitree_z1_stackbox_case2.mp4",
    "pred_video": "unitree_z1_stackbox/case2/output/inference/15_full_fs4.mp4",
    "psnr": 38.94694381287429
 }
--- a/unitree_z1_stackbox/case2/run_world_model_interaction.sh
+++ b/unitree_z1_stackbox/case2/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_stackbox"
        --n_iter 12 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_stackbox/case3/output.log
+++ b/unitree_z1_stackbox/case3/output.log
@@ -0,0 +1,126 @@
 2026-02-11 18:53:57.068615: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:53:57.118271: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:53:57.118312: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:53:57.119665: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:53:57.127266: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:53:58.042116: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/12 [00:00<?, ?it/s]
  8%|▊         | 1/12 [00:23<04:17, 23.43s/it]
 17%|█▋        | 2/12 [00:46<03:50, 23.02s/it]
 25%|██▌       | 3/12 [01:09<03:26, 22.96s/it]
 33%|███▎      | 4/12 [01:31<03:03, 22.92s/it]
 42%|████▏     | 5/12 [01:54<02:40, 22.87s/it]
 50%|█████     | 6/12 [02:17<02:17, 22.85s/it]
 58%|█████▊    | 7/12 [02:40<01:54, 22.83s/it]
 67%|██████▋   | 8/12 [03:03<01:31, 22.80s/it]
 75%|███████▌  | 9/12 [03:25<01:08, 22.78s/it]
 83%|████████▎ | 10/12 [03:48<00:45, 22.77s/it]
 92%|█████████▏| 11/12 [04:11<00:22, 22.76s/it]
 100%|██████████| 12/12 [04:33<00:00, 22.75s/it]
 100%|██████████| 12/12 [04:33<00:00, 22.83s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 9: generating actions ...
 >>> Step 9: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 10: generating actions ...
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 11: generating actions ...
 >>> Step 11: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
--- a/unitree_z1_stackbox/case3/psnr_result.json
+++ b/unitree_z1_stackbox/case3/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_stackbox/case3/unitree_z1_stackbox_case3.mp4",
    "pred_video": "unitree_z1_stackbox/case3/output/inference/25_full_fs4.mp4",
    "psnr": 49.489774674892764
 }
--- a/unitree_z1_stackbox/case3/run_world_model_interaction.sh
+++ b/unitree_z1_stackbox/case3/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_stackbox"
        --n_iter 12 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_stackbox/case4/output.log
+++ b/unitree_z1_stackbox/case4/output.log
@@ -0,0 +1,126 @@
 2026-02-11 18:59:09.688302: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 18:59:09.737473: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 18:59:09.737518: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 18:59:09.738835: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 18:59:09.746322: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 18:59:10.660940: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/12 [00:00<?, ?it/s]
  8%|▊         | 1/12 [00:23<04:17, 23.39s/it]
 17%|█▋        | 2/12 [00:46<03:50, 23.01s/it]
 25%|██▌       | 3/12 [01:09<03:26, 22.96s/it]
 33%|███▎      | 4/12 [01:31<03:03, 22.92s/it]
 42%|████▏     | 5/12 [01:54<02:40, 22.86s/it]
 50%|█████     | 6/12 [02:17<02:16, 22.82s/it]
 58%|█████▊    | 7/12 [02:40<01:53, 22.79s/it]
 67%|██████▋   | 8/12 [03:02<01:31, 22.77s/it]
 75%|███████▌  | 9/12 [03:25<01:08, 22.77s/it]
 83%|████████▎ | 10/12 [03:48<00:45, 22.78s/it]
 92%|█████████▏| 11/12 [04:11<00:22, 22.77s/it]
 100%|██████████| 12/12 [04:33<00:00, 22.77s/it]
 100%|██████████| 12/12 [04:33<00:00, 22.83s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 9: generating actions ...
 >>> Step 9: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 10: generating actions ...
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 11: generating actions ...
 >>> Step 11: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
--- a/unitree_z1_stackbox/case4/psnr_result.json
+++ b/unitree_z1_stackbox/case4/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_stackbox/case4/unitree_z1_stackbox_case4.mp4",
    "pred_video": "unitree_z1_stackbox/case4/output/inference/35_full_fs4.mp4",
    "psnr": 47.18724378194084
 }
--- a/unitree_z1_stackbox/case4/run_world_model_interaction.sh
+++ b/unitree_z1_stackbox/case4/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_stackbox"
        --n_iter 12 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
Author	SHA1	Message	Date
qhy	ef56e5dcdb	Revert "tensorRT engines尝试精度没过，暂时先提交代码，后续再继续调试" This reverts commit `e1f8a83648`.	2026-02-19 20:22:19 +08:00
qhy	e1f8a83648	tensorRT engines尝试精度没过，暂时先提交代码，后续再继续调试	2026-02-18 18:22:12 +08:00
qhy	5e0e21d91b	复原sh为原始版本	2026-02-18 14:11:55 +08:00
qhy	d5bec53f61	优化后的全部结果	2026-02-11 19:21:06 +08:00
qhy	508b91f5a2	延迟 decode，只解码 CLIP 需要的 1 帧 - world model 调用 decode_video=False，跳过 16 帧全量 decode - 只 decode 最后 1 帧给 CLIP embedding / observation queue - 存 raw latent，循环结束后统一 batch decode 生成最终视频 - 每轮省 15 次 VAE decode，8 轮共省 120 次 - 跳过中间迭代的 wm tensorboard/mp4 保存 psnr微弱下降	2026-02-11 17:07:33 +08:00
qhy	3101252c25	速度变化不明显psnr显著提升	2026-02-11 16:38:21 +08:00
qhy	f386a5810b	补充上次提交	2026-02-11 16:24:40 +08:00
qhy	352a79035f	主干部分fp16,最敏感psnr=25.21,可以考虑对主干部分太敏感的部分回退fp32	2026-02-11 16:23:21 +08:00
qhy	9a08e27a19	KV 融合实现完成。改动总结：速度微弱提升psnr略微上升 attention.py — 3处改动： 1. __init__ 添加 _kv_fused = False 标志 2.新增 fuse_kv() 方法：将 to_k + to_v → to_kv，同时处理 _ip/_as/_aa 辅助 KV 对 2. bmm_forward 两个分支加_kv_fused 判断，用to_kv().chunk(2, dim=-1) 替代分别调用	2026-02-11 12:36:38 +08:00
qhy	b558856e1e	fix bugs	2026-02-10 22:35:45 +08:00
qhy	dcbcb2c377	- state_unet 放到一个独立的 CUDA stream 上执行 - action_unet 在默认 stream 上同时执行 - 用 wait_stream 确保两者都完成后再返回两个 1D UNet 输入完全独立，共享的 hs_a 和 context_action 都是只读的。GPU 利用率只有 ~31%，小张量 kernel 不会打满 GPU，两个 stream 可以真正并行。	2026-02-10 21:41:48 +08:00
qhy	ff43432ef9	结果	2026-02-10 20:01:25 +08:00
qhy	afa12ba031	每步迭代保存异步	2026-02-10 19:54:53 +08:00
qhy	bf4d66c874	跳过模型加载	2026-02-10 19:36:17 +08:00
qhy	9347a4ebe5	实现了Context 预计算和缓存功能，提升了采样效率。 psnr不下降	2026-02-10 17:47:46 +08:00
qhy	223a50f9e0	添加CrossAttention kv缓存，减少重复计算，提升性能，psnr=25.1201dB	2026-02-10 17:35:03 +08:00
qhy	2a6068f9e4	减少了一路视频vae解码	2026-02-10 17:13:45 +08:00
qhy	91a9b0febc	DDIM loop 内小张量分配优化，attention mask 缓存到 GPU	2026-02-10 16:53:00 +08:00
qhy	ed637c972b	tf32推理	2026-02-10 16:39:14 +08:00
		`@@ -0,0 +1,2 @@`
							`eval "$(conda shell.bash hook 2>/dev/null)"`
							`conda activate unifolm-wma`