延迟 decode，只解码 CLIP 需要的 1 帧

- world model 调用 decode_video=False，跳过 16 帧全量 decode - 只 decode 最后 1 帧给 CLIP embedding / observation queue - 存 raw latent，循环结束后统一 batch decode 生成最终视频 - 每轮省 15 次 VAE decode，8 轮共省 120 次 - 跳过中间迭代的 wm tensorboard/mp4 保存 psnr微弱下降
速度变化不明显，PSNR 显著提升
2026-02-11 17:07:33 +08:00 · 2026-02-11 16:38:21 +08:00 · 2026-02-11 16:24:40 +08:00 · 2026-02-11 16:23:21 +08:00
65 changed files with 4871 additions and 141 deletions
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -9,7 +9,12 @@
      "Bash(nvidia-smi:*)",
      "Bash(conda activate unifolm-wma)",
      "Bash(conda info:*)",
-      "Bash(direnv allow:*)"
+      "Bash(direnv allow:*)",
      "Bash(ls:*)",
      "Bash(for scenario in unitree_g1_pack_camera unitree_z1_dual_arm_cleanup_pencils unitree_z1_dual_arm_stackbox unitree_z1_dual_arm_stackbox_v2 unitree_z1_stackbox)",
      "Bash(do for case in case1 case2 case3 case4)",
      "Bash(done)",
      "Bash(chmod:*)"
    ]
  }
 }
--- a/run_all_case.sh
+++ b/run_all_case.sh
@@ -0,0 +1,114 @@
 #!/bin/bash
 # Batch runner: executes run_world_model_interaction.sh for every
 # scenario/case pair listed below (5 scenarios x 4 cases = 20 tasks).
 # The combined stdout/stderr of every case script is appended to a single
 # timestamped log file; progress and a final summary go to the terminal.
 # A missing or failing case script is recorded and the batch continues.
 #
 # Case directories are addressed relative to this script, so switch to the
 # script's own directory first; otherwise launching it from another working
 # directory would report every case script as missing.
 cd "$(dirname "$0")" || exit 1
 # Environment variables (offline mode)
 export HF_HUB_OFFLINE=1
 export TRANSFORMERS_OFFLINE=1
 # Color definitions (escape sequences are interpreted by `echo -e`)
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 BLUE='\033[0;34m'
 NC='\033[0m' # No Color
 # All scenarios
 SCENARIOS=(
    "unitree_g1_pack_camera"
    "unitree_z1_dual_arm_cleanup_pencils"
    "unitree_z1_dual_arm_stackbox"
    "unitree_z1_dual_arm_stackbox_v2"
    "unitree_z1_stackbox"
 )
 # Case numbers available in every scenario
 CASES=(1 2 3 4)
 # Counters; the total is computed once and reused by the banner and summary
 TOTAL_CASES=$((${#SCENARIOS[@]} * ${#CASES[@]}))
 CURRENT_CASE=0
 SUCCESS_COUNT=0
 FAIL_COUNT=0
 # Human-readable descriptions of the cases that failed
 declare -a FAILED_CASES=()
 # Record the start time and derive the log file name from the current time
 START_TIME=$(date +%s)
 LOG_FILE="run_all_cases_$(date +%Y%m%d_%H%M%S).log"
 echo -e "${BLUE}========================================${NC}"
 echo -e "${BLUE}开始执行所有场景的case${NC}"
 echo -e "${BLUE}总共: ${#SCENARIOS[@]} 个场景 x ${#CASES[@]} 个case = ${TOTAL_CASES} 个任务${NC}"
 echo -e "${BLUE}日志文件: ${LOG_FILE}${NC}"
 echo -e "${BLUE}========================================${NC}"
 echo ""
 # Iterate over all scenarios
 for scenario in "${SCENARIOS[@]}"; do
    echo -e "${YELLOW}>>> 场景: ${scenario}${NC}"
    # Iterate over all cases of this scenario
    for case_num in "${CASES[@]}"; do
        CURRENT_CASE=$((CURRENT_CASE + 1))
        case_dir="${scenario}/case${case_num}"
        script_path="${case_dir}/run_world_model_interaction.sh"
        echo -e "${BLUE}[${CURRENT_CASE}/${TOTAL_CASES}] 执行: ${case_dir}${NC}"
        # A missing script counts as a failure but does not stop the batch
        if [ ! -f "${script_path}" ]; then
            echo -e "${RED}错误: 脚本不存在 ${script_path}${NC}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            FAILED_CASES+=("${case_dir} (脚本不存在)")
            continue
        fi
        # Run the case script; its output goes to the shared log file only
        echo "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"
        if bash "${script_path}" >> "${LOG_FILE}" 2>&1; then
            echo -e "${GREEN}✓ 成功: ${case_dir}${NC}"
            SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
        else
            echo -e "${RED}✗ 失败: ${case_dir}${NC}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            FAILED_CASES+=("${case_dir}")
        fi
        echo "结束时间: $(date '+%Y-%m-%d %H:%M:%S')"
        echo ""
    done
    echo ""
 done
 # Compute the total elapsed time
 END_TIME=$(date +%s)
 DURATION=$((END_TIME - START_TIME))
 HOURS=$((DURATION / 3600))
 MINUTES=$(((DURATION % 3600) / 60))
 # Deliberately not named SECONDS: that is a special bash variable which keeps
 # counting up after being assigned, so the printed value could drift.
 REMAINING_SECONDS=$((DURATION % 60))
 # Print the summary
 echo -e "${BLUE}========================================${NC}"
 echo -e "${BLUE}执行完成！${NC}"
 echo -e "${BLUE}========================================${NC}"
 echo -e "总任务数: ${TOTAL_CASES}"
 echo -e "${GREEN}成功: ${SUCCESS_COUNT}${NC}"
 echo -e "${RED}失败: ${FAIL_COUNT}${NC}"
 echo -e "总耗时: ${HOURS}小时 ${MINUTES}分钟 ${REMAINING_SECONDS}秒"
 echo -e "详细日志: ${LOG_FILE}"
 echo ""
 # List the failed cases, if any
 if [ "${FAIL_COUNT}" -gt 0 ]; then
    echo -e "${RED}失败的case列表:${NC}"
    for failed_case in "${FAILED_CASES[@]}"; do
        echo -e "${RED}  - ${failed_case}${NC}"
    done
    echo ""
 fi
 echo -e "${BLUE}========================================${NC}"
--- a/run_all_cases_20260211_135725.log
+++ b/run_all_cases_20260211_135725.log
--- a/run_all_psnr.sh
+++ b/run_all_psnr.sh
@@ -0,0 +1,61 @@
 #!/bin/bash
 # Batch PSNR scoring: for every scenario/case pair listed below, invokes
 # psnr_score_for_challenge.py with the case's ground-truth video, the first
 # matching prediction video under output/inference/, and an output path of
 # <case_dir>/psnr_result.json. Cases with a missing ground-truth or
 # prediction video are skipped and counted as failures.
 set -e
 # All paths below are relative to the directory containing this script.
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 cd "$SCRIPT_DIR"
 SCENARIOS=(
    unitree_g1_pack_camera
    unitree_z1_dual_arm_cleanup_pencils
    unitree_z1_dual_arm_stackbox
    unitree_z1_dual_arm_stackbox_v2
    unitree_z1_stackbox
 )
 CASES=(case1 case2 case3 case4)
 # Progress denominator derived from the lists above instead of a literal 20,
 # so it stays correct when scenarios or cases are added or removed.
 expected_total=$((${#SCENARIOS[@]} * ${#CASES[@]}))
 total=0
 success=0
 fail=0
 for scenario in "${SCENARIOS[@]}"; do
    for case_name in "${CASES[@]}"; do
        case_dir="${scenario}/${case_name}"
        gt_video="${case_dir}/${scenario}_${case_name}.mp4"
        output_file="${case_dir}/psnr_result.json"
        # Pick the first prediction video matching the pattern. Globbing is
        # used directly rather than parsing `ls` output; an unmatched pattern
        # stays literal and is rejected by the existence check.
        pred_video=""
        for candidate in "${case_dir}"/output/inference/*_full_fs*.mp4; do
            if [ -e "$candidate" ]; then
                pred_video="$candidate"
                break
            fi
        done
        total=$((total + 1))
        echo "=========================================="
        echo "[${total}/${expected_total}] ${case_dir}"
        if [ ! -f "$gt_video" ]; then
            echo "  SKIP: GT video not found: $gt_video"
            fail=$((fail + 1))
            continue
        fi
        if [ -z "$pred_video" ]; then
            echo "  SKIP: pred video not found in ${case_dir}/output/inference/"
            fail=$((fail + 1))
            continue
        fi
        echo "  GT:   $gt_video"
        echo "  Pred: $pred_video"
        echo "  Out:  $output_file"
        # Run inside `if` so a scoring failure is counted instead of
        # aborting the whole batch under `set -e`.
        if python3 psnr_score_for_challenge.py \
            --gt_video "$gt_video" \
            --pred_video "$pred_video" \
            --output_file "$output_file"; then
            success=$((success + 1))
            echo "  DONE"
        else
            fail=$((fail + 1))
            echo "  FAILED"
        fi
    done
 done
 echo "=========================================="
 echo "Finished: ${success} success, ${fail} fail, ${total} total"
--- a/scripts/evaluation/world_model_interaction.py
+++ b/scripts/evaluation/world_model_interaction.py
@@ -450,6 +450,7 @@ def image_guided_synthesis_sim_mode(
    img = observation['observation.images.top'].permute(0, 2, 1, 3, 4)
    cond_img = rearrange(img, 'b o c h w -> (b o) c h w')[-1:]
    with torch.cuda.amp.autocast(dtype=torch.float16):
        cond_img_emb = model.embedder(cond_img)
        cond_img_emb = model.image_proj_model(cond_img_emb)
@@ -465,6 +466,7 @@ def image_guided_synthesis_sim_mode(
        prompts = [""] * batch_size
    cond_ins_emb = model.get_learned_conditioning(prompts)
    with torch.cuda.amp.autocast(dtype=torch.float16):
        cond_state_emb = model.state_projector(observation['observation.state'])
        cond_state_emb = cond_state_emb + model.agent_state_pos_emb
@@ -492,6 +494,7 @@ def image_guided_synthesis_sim_mode(
    cond_mask = None
    cond_z0 = None
    batch_variants = None
    samples = None
    if ddim_sampler is not None:
        samples, actions, states, intermedia = ddim_sampler.sample(
            S=ddim_steps,
@@ -515,7 +518,7 @@ def image_guided_synthesis_sim_mode(
            batch_images = model.decode_first_stage(samples)
            batch_variants = batch_images
-    return batch_variants, actions, states
+    return batch_variants, actions, states, samples
 def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
@@ -571,6 +574,22 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
        torch.save(model, prepared_path)
        print(f">>> Prepared model saved ({os.path.getsize(prepared_path) / 1024**3:.1f} GB).")
    # ---- FP16: convert diffusion backbone + conditioning modules ----
    model.model.to(torch.float16)
    model.model.diffusion_model.dtype = torch.float16
    print(">>> Diffusion backbone (model.model) converted to FP16.")
    # Projectors / MLP → FP16
    model.image_proj_model.half()
    model.state_projector.half()
    model.action_projector.half()
    print(">>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.")
    # Text/image encoders → FP16
    model.cond_stage_model.half()
    model.embedder.half()
    print(">>> Encoders (cond_stage_model, embedder) converted to FP16.")
    # Build normalizer (always needed, independent of model loading path)
    logging.info("***** Configing Data *****")
    data = instantiate_from_config(config.data)
@@ -630,7 +649,7 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
            sample_save_dir = f'{video_save_dir}/wm/{fs}'
            os.makedirs(sample_save_dir, exist_ok=True)
            # For collecting interaction videos
-            wm_video = []
+            wm_latent = []
            # Initialize observation queues
            cond_obs_queues = {
                "observation.images.top":
@@ -686,7 +705,7 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                # Use world-model in policy to generate action
                print(f'>>> Step {itr}: generating actions ...')
-                pred_videos_0, pred_actions, _ = image_guided_synthesis_sim_mode(
+                pred_videos_0, pred_actions, _, _ = image_guided_synthesis_sim_mode(
                    model,
                    sample['instruction'],
                    observation,
@@ -728,7 +747,7 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                # Interaction with the world-model
                print(f'>>> Step {itr}: interacting with world model ...')
-                pred_videos_1, _, pred_states = image_guided_synthesis_sim_mode(
+                pred_videos_1, _, pred_states, wm_samples = image_guided_synthesis_sim_mode(
                    model,
                    "",
                    observation,
@@ -741,12 +760,16 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                    fs=model_input_fs,
                    text_input=False,
                    timestep_spacing=args.timestep_spacing,
-                    guidance_rescale=args.guidance_rescale)
+                    guidance_rescale=args.guidance_rescale,
                    decode_video=False)
                # Decode only the last frame for CLIP embedding in next iteration
                last_frame_pixel = model.decode_first_stage(wm_samples[:, :, -1:, :, :])
                for idx in range(args.exe_steps):
                    observation = {
                        'observation.images.top':
-                        pred_videos_1[0][:, idx:idx + 1].permute(1, 0, 2, 3),
+                        last_frame_pixel[0, :, 0:1].permute(1, 0, 2, 3),
                        'observation.state':
                        torch.zeros_like(pred_states[0][idx:idx + 1]) if
                        args.zero_pred_state else pred_states[0][idx:idx + 1],
@@ -764,30 +787,14 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                                             pred_videos_0,
                                             sample_tag,
                                             fps=args.save_fps)
                # Save videos environment changes via world-model interaction
                sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/itr-{itr}"
                log_to_tensorboard_async(writer,
                                         pred_videos_1,
                                         sample_tag,
                                         fps=args.save_fps)
                # Save the imagen videos for decision-making
                if pred_videos_0 is not None:
                    sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4'
                    save_results_async(pred_videos_0,
                                       sample_video_file,
                                       fps=args.save_fps)
                # Save videos environment changes via world-model interaction
                sample_video_file = f'{video_save_dir}/wm/{fs}/itr-{itr}.mp4'
                save_results_async(pred_videos_1,
                                   sample_video_file,
                                   fps=args.save_fps)
                print('>' * 24)
-                # Collect the result of world-model interactions
+                # Store raw latent for deferred decode
-                wm_video.append(pred_videos_1[:, :, :args.exe_steps].cpu())
+                wm_latent.append(wm_samples[:, :, :args.exe_steps].cpu())
-            full_video = torch.cat(wm_video, dim=2)
+            # Deferred decode: batch decode all stored latents
            full_latent = torch.cat(wm_latent, dim=2).to(device)
            full_video = model.decode_first_stage(full_latent).cpu()
            sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/full"
            log_to_tensorboard_async(writer,
                                     full_video,
--- a/src/unifolm_wma/models/ddpms.py
+++ b/src/unifolm_wma/models/ddpms.py
@@ -988,7 +988,7 @@ class LatentDiffusion(DDPM):
    def instantiate_cond_stage(self, config: OmegaConf) -> None:
        """
-        Build the conditioning stage model.
+        Build the conditioning stage model. Frozen models are converted to FP16.
        Args:
            config: OmegaConf config describing the conditioning model to instantiate.
@@ -1000,6 +1000,7 @@ class LatentDiffusion(DDPM):
            self.cond_stage_model.train = disabled_train
            for param in self.cond_stage_model.parameters():
                param.requires_grad = False
            self.cond_stage_model.half()
        else:
            model = instantiate_from_config(config)
            self.cond_stage_model = model
@@ -1014,6 +1015,7 @@ class LatentDiffusion(DDPM):
        Returns:
            Conditioning embedding as a tensor (shape depends on cond model).
        """
        with torch.cuda.amp.autocast(dtype=torch.float16):
            if self.cond_stage_forward is None:
                if hasattr(self.cond_stage_model, 'encode') and callable(
                        self.cond_stage_model.encode):
@@ -1957,6 +1959,7 @@ class LatentVisualDiffusion(LatentDiffusion):
            self.image_proj_model.train = disabled_train
            for param in self.image_proj_model.parameters():
                param.requires_grad = False
            self.image_proj_model.half()
    def _init_embedder(self, config: OmegaConf, freeze: bool = True) -> None:
        """
@@ -1972,6 +1975,7 @@ class LatentVisualDiffusion(LatentDiffusion):
            self.embedder.train = disabled_train
            for param in self.embedder.parameters():
                param.requires_grad = False
            self.embedder.half()
    def init_normalizers(self, normalize_config: OmegaConf,
                         dataset_stats: Mapping[str, Any]) -> None:
@@ -2175,6 +2179,7 @@ class LatentVisualDiffusion(LatentDiffusion):
            (random_num < 3 * self.uncond_prob).float(), "n -> n 1 1 1")
        cond_img = input_mask * img
        with torch.cuda.amp.autocast(dtype=torch.float16):
            cond_img_emb = self.embedder(cond_img)
            cond_img_emb = self.image_proj_model(cond_img_emb)
@@ -2191,6 +2196,7 @@ class LatentVisualDiffusion(LatentDiffusion):
                                      repeat=z.shape[2])
            cond["c_concat"] = [img_cat_cond]
        with torch.cuda.amp.autocast(dtype=torch.float16):
            cond_action = self.action_projector(action)
            cond_action_emb = self.agent_action_pos_emb + cond_action
            # Get conditioning states
@@ -2457,7 +2463,17 @@ class DiffusionWrapper(pl.LightningModule):
        Returns:
            Output from the inner diffusion model (tensor or tuple, depending on the model).
        """
        with torch.cuda.amp.autocast(dtype=torch.float16):
            return self._forward_impl(x, x_action, x_state, t,
                                      c_concat, c_crossattn, c_crossattn_action,
                                      c_adm, s, mask, **kwargs)
    def _forward_impl(
        self,
        x, x_action, x_state, t,
        c_concat=None, c_crossattn=None, c_crossattn_action=None,
        c_adm=None, s=None, mask=None, **kwargs,
    ):
        if self.conditioning_key is None:
            out = self.diffusion_model(x, t)
        elif self.conditioning_key == 'concat':
--- a/unitree_g1_pack_camera/case1/output.log
+++ b/unitree_g1_pack_camera/case1/output.log
@@ -0,0 +1,120 @@
 2026-02-11 13:57:30.192579: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 13:57:30.242090: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 13:57:30.242134: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 13:57:30.243443: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 13:57:30.250963: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 13:57:31.177911: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:33<05:33, 33.38s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 18%|█▊        | 2/11 [01:07<05:02, 33.64s/it]
 27%|██▋       | 3/11 [01:41<04:31, 33.90s/it]
 36%|███▋      | 4/11 [02:15<03:58, 34.05s/it]
 45%|████▌     | 5/11 [02:49<03:24, 34.07s/it]
 55%|█████▍    | 6/11 [03:23<02:50, 34.08s/it]
 64%|██████▎   | 7/11 [03:58<02:16, 34.10s/it]
 73%|███████▎  | 8/11 [04:32<01:42, 34.12s/it]
 82%|████████▏ | 9/11 [05:06<01:08, 34.12s/it]
 91%|█████████ | 10/11 [05:40<00:34, 34.11s/it]
 100%|██████████| 11/11 [06:14<00:00, 34.10s/it]
 100%|██████████| 11/11 [06:14<00:00, 34.04s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_g1_pack_camera/case1/psnr_result.json
+++ b/unitree_g1_pack_camera/case1/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_g1_pack_camera/case1/unitree_g1_pack_camera_case1.mp4",
    "pred_video": "unitree_g1_pack_camera/case1/output/inference/0_full_fs6.mp4",
    "psnr": 34.28787704598647
 }
--- a/unitree_g1_pack_camera/case1/run_world_model_interaction.sh
+++ b/unitree_g1_pack_camera/case1/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_g1_pack_camera"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_g1_pack_camera/case2/output.log
+++ b/unitree_g1_pack_camera/case2/output.log
@@ -0,0 +1,120 @@
 2026-02-11 14:04:18.542839: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 14:04:18.593447: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 14:04:18.593492: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 14:04:18.594810: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 14:04:18.602331: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 14:04:19.529518: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:34<05:41, 34.14s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 18%|█▊        | 2/11 [01:08<05:08, 34.26s/it]
 27%|██▋       | 3/11 [01:42<04:33, 34.23s/it]
 36%|███▋      | 4/11 [02:16<03:59, 34.23s/it]
 45%|████▌     | 5/11 [02:50<03:25, 34.18s/it]
 55%|█████▍    | 6/11 [03:25<02:50, 34.18s/it]
 64%|██████▎   | 7/11 [03:59<02:16, 34.18s/it]
 73%|███████▎  | 8/11 [04:33<01:42, 34.16s/it]
 82%|████████▏ | 9/11 [05:07<01:08, 34.15s/it]
 91%|█████████ | 10/11 [05:41<00:34, 34.16s/it]
 100%|██████████| 11/11 [06:15<00:00, 34.13s/it]
 100%|██████████| 11/11 [06:15<00:00, 34.17s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_g1_pack_camera/case2/psnr_result.json
+++ b/unitree_g1_pack_camera/case2/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_g1_pack_camera/case2/unitree_g1_pack_camera_case2.mp4",
    "pred_video": "unitree_g1_pack_camera/case2/output/inference/50_full_fs6.mp4",
    "psnr": 43.756296364111726
 }
--- a/unitree_g1_pack_camera/case2/run_world_model_interaction.sh
+++ b/unitree_g1_pack_camera/case2/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_g1_pack_camera"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_g1_pack_camera/case3/output.log
+++ b/unitree_g1_pack_camera/case3/output.log
@@ -0,0 +1,120 @@
 2026-02-11 14:11:08.388455: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 14:11:08.437992: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 14:11:08.438037: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 14:11:08.439358: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 14:11:08.446903: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 14:11:09.369764: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:34<05:40, 34.10s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 18%|█▊        | 2/11 [01:08<05:08, 34.25s/it]
 27%|██▋       | 3/11 [01:42<04:33, 34.25s/it]
 36%|███▋      | 4/11 [02:16<03:59, 34.22s/it]
 45%|████▌     | 5/11 [02:51<03:25, 34.20s/it]
 55%|█████▍    | 6/11 [03:25<02:50, 34.16s/it]
 64%|██████▎   | 7/11 [03:59<02:16, 34.14s/it]
 73%|███████▎  | 8/11 [04:33<01:42, 34.13s/it]
 82%|████████▏ | 9/11 [05:07<01:08, 34.11s/it]
 91%|█████████ | 10/11 [05:41<00:34, 34.09s/it]
 100%|██████████| 11/11 [06:15<00:00, 34.07s/it]
 100%|██████████| 11/11 [06:15<00:00, 34.13s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_g1_pack_camera/case3/psnr_result.json
+++ b/unitree_g1_pack_camera/case3/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_g1_pack_camera/case3/unitree_g1_pack_camera_case3.mp4",
    "pred_video": "unitree_g1_pack_camera/case3/output/inference/100_full_fs6.mp4",
    "psnr": 37.65161306938167
 }
--- a/unitree_g1_pack_camera/case3/run_world_model_interaction.sh
+++ b/unitree_g1_pack_camera/case3/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_g1_pack_camera"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_g1_pack_camera/case4/output.log
+++ b/unitree_g1_pack_camera/case4/output.log
@@ -0,0 +1,120 @@
 2026-02-11 14:17:57.092085: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 14:17:57.141607: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 14:17:57.141661: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 14:17:57.142984: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 14:17:57.150517: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 14:17:58.074812: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:34<05:41, 34.20s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 18%|█▊        | 2/11 [01:08<05:08, 34.29s/it]
 27%|██▋       | 3/11 [01:42<04:34, 34.25s/it]
 36%|███▋      | 4/11 [02:16<03:59, 34.23s/it]
 45%|████▌     | 5/11 [02:51<03:25, 34.19s/it]
 55%|█████▍    | 6/11 [03:25<02:50, 34.17s/it]
 64%|██████▎   | 7/11 [03:59<02:16, 34.19s/it]
 73%|███████▎  | 8/11 [04:33<01:42, 34.18s/it]
 82%|████████▏ | 9/11 [05:07<01:08, 34.16s/it]
 91%|█████████ | 10/11 [05:41<00:34, 34.13s/it]
 100%|██████████| 11/11 [06:15<00:00, 34.11s/it]
 100%|██████████| 11/11 [06:15<00:00, 34.17s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_g1_pack_camera/case4/psnr_result.json
+++ b/unitree_g1_pack_camera/case4/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_g1_pack_camera/case4/unitree_g1_pack_camera_case4.mp4",
    "pred_video": "unitree_g1_pack_camera/case4/output/inference/200_full_fs6.mp4",
    "psnr": 33.205596596179475
 }
--- a/unitree_g1_pack_camera/case4/run_world_model_interaction.sh
+++ b/unitree_g1_pack_camera/case4/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_g1_pack_camera"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
@@ -1,24 +1,13 @@
-2026-02-10 15:38:28.973314: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-11 14:24:46.595601: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
-2026-02-10 15:38:29.023024: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-11 14:24:46.645554: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
-2026-02-10 15:38:29.023070: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-11 14:24:46.645598: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
-2026-02-10 15:38:29.024393: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-11 14:24:46.646935: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
-2026-02-10 15:38:29.031901: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+2026-02-11 14:24:46.654595: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
-2026-02-10 15:38:29.955454: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+2026-02-11 14:24:47.580547: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
-INFO:mainlogger:LatentVisualDiffusion: Running in v-prediction mode
+>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
-INFO:unifolm_wma.models.diffusion_head.conditional_unet1d:number of parameters: 5.010531e+08
+>>> Prepared model loaded.
 INFO:unifolm_wma.models.diffusion_head.conditional_unet1d:number of parameters: 5.010531e+08
 AE working on z of shape (1, 4, 32, 32) = 4096 dimensions.
 INFO:root:Loaded ViT-H-14 model config.
 DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): hf-mirror.com:443
 DEBUG:urllib3.connectionpool:https://hf-mirror.com:443 "HEAD /laion/CLIP-ViT-H-14-laion2B-s32B-b79K/resolve/main/open_clip_pytorch_model.bin HTTP/1.1" 302 0
 INFO:root:Loading pretrained ViT-H-14 weights (laion2b_s32b_b79k).
 INFO:root:Loaded ViT-H-14 model config.
 DEBUG:urllib3.connectionpool:https://hf-mirror.com:443 "HEAD /laion/CLIP-ViT-H-14-laion2B-s32B-b79K/resolve/main/open_clip_pytorch_model.bin HTTP/1.1" 302 0
 INFO:root:Loading pretrained ViT-H-14 weights (laion2b_s32b_b79k).
 >>> model checkpoint loaded.
 >>> Load pre-trained model ...
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
@@ -36,13 +25,16 @@ INFO:root:***** Configing Data *****
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
-
+
  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:34<03:58, 34.12s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
@@ -92,9 +84,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
-DEBUG:PIL.Image:Importing XVThumbImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
 12%|█▎        | 1/8 [01:14<08:41, 74.51s/it]
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 25%|██▌       | 2/8 [01:08<03:24, 34.15s/it]
@@ -116,6 +106,6 @@ DEBUG:PIL.Image:Importing XVThumbImagePlugin
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
->>> Step 5: generating actions ...
+>>> Step 5: generating actions ...
->>> Step 5: interacting with world model ...
+>>> Step 5: interacting with world model ...
->>>>>>>>>>>>>>>>>>>>>>>>
+>>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result.json
@@ -1,5 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case1/unitree_z1_dual_arm_cleanup_pencils_case1.mp4",
    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/0_full_fs4.mp4",
-    "psnr": 47.911564449209735
+    "psnr": 48.52515070316814
 }
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_cleanup_pencils"
        --n_iter 8 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_cleanup_pencils/case2/output.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case2/output.log
@@ -0,0 +1,111 @@
 2026-02-11 14:29:51.911195: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 14:29:51.961101: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 14:29:51.961156: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 14:29:51.962467: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 14:29:51.969980: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 14:29:52.899745: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:34<03:59, 34.22s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 25%|██▌       | 2/8 [01:08<03:25, 34.24s/it]
 38%|███▊      | 3/8 [01:42<02:51, 34.21s/it]
 50%|█████     | 4/8 [02:16<02:16, 34.18s/it]
 62%|██████▎   | 5/8 [02:50<01:42, 34.15s/it]
 75%|███████▌  | 6/8 [03:24<01:08, 34.11s/it]
 88%|████████▊ | 7/8 [03:58<00:34, 34.10s/it]
 100%|██████████| 8/8 [04:32<00:00, 34.07s/it]
 100%|██████████| 8/8 [04:32<00:00, 34.12s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_z1_dual_arm_cleanup_pencils/case2/psnr_result.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case2/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case2/unitree_z1_dual_arm_cleanup_pencils_case2.mp4",
    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case2/output/inference/50_full_fs4.mp4",
    "psnr": 47.91455867741451
 }
--- a/unitree_z1_dual_arm_cleanup_pencils/case2/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_cleanup_pencils/case2/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_cleanup_pencils"
        --n_iter 8 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_cleanup_pencils/case3/output.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case3/output.log
@@ -0,0 +1,111 @@
 2026-02-11 14:34:58.016000: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 14:34:58.066369: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 14:34:58.066418: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 14:34:58.067763: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 14:34:58.075447: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 14:34:59.008184: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:34<03:58, 34.14s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 25%|██▌       | 2/8 [01:08<03:25, 34.21s/it]
 38%|███▊      | 3/8 [01:42<02:50, 34.18s/it]
 50%|█████     | 4/8 [02:16<02:16, 34.16s/it]
 62%|██████▎   | 5/8 [02:50<01:42, 34.11s/it]
 75%|███████▌  | 6/8 [03:24<01:08, 34.08s/it]
 88%|████████▊ | 7/8 [03:58<00:34, 34.09s/it]
 100%|██████████| 8/8 [04:32<00:00, 34.08s/it]
 100%|██████████| 8/8 [04:32<00:00, 34.11s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_z1_dual_arm_cleanup_pencils/case3/psnr_result.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case3/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case3/unitree_z1_dual_arm_cleanup_pencils_case3.mp4",
    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case3/output/inference/100_full_fs4.mp4",
    "psnr": 41.260758562627046
 }
--- a/unitree_z1_dual_arm_cleanup_pencils/case3/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_cleanup_pencils/case3/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_cleanup_pencils"
        --n_iter 8 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_cleanup_pencils/case4/output.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case4/output.log
@@ -0,0 +1,111 @@
 2026-02-11 14:40:06.781951: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 14:40:06.832600: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 14:40:06.832649: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 14:40:06.833982: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 14:40:06.841504: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 14:40:07.772162: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/8 [00:00<?, ?it/s]
 12%|█▎        | 1/8 [00:34<03:59, 34.18s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 25%|██▌       | 2/8 [01:08<03:25, 34.26s/it]
 38%|███▊      | 3/8 [01:42<02:51, 34.23s/it]
 50%|█████     | 4/8 [02:16<02:16, 34.20s/it]
 62%|██████▎   | 5/8 [02:50<01:42, 34.16s/it]
 75%|███████▌  | 6/8 [03:25<01:08, 34.14s/it]
 88%|████████▊ | 7/8 [03:59<00:34, 34.12s/it]
 100%|██████████| 8/8 [04:33<00:00, 34.12s/it]
 100%|██████████| 8/8 [04:33<00:00, 34.15s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_z1_dual_arm_cleanup_pencils/case4/psnr_result.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case4/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case4/unitree_z1_dual_arm_cleanup_pencils_case4.mp4",
    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case4/output/inference/200_full_fs4.mp4",
    "psnr": 47.046499351779815
 }
--- a/unitree_z1_dual_arm_cleanup_pencils/case4/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_cleanup_pencils/case4/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_cleanup_pencils"
        --n_iter 8 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox/case1/output.log
+++ b/unitree_z1_dual_arm_stackbox/case1/output.log
@@ -0,0 +1,108 @@
 2026-02-11 14:45:16.672502: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 14:45:16.722666: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 14:45:16.722716: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 14:45:16.724025: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 14:45:16.731562: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 14:45:17.646917: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/7 [00:00<?, ?it/s]
 14%|█▍        | 1/7 [00:34<03:24, 34.09s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 29%|██▊       | 2/7 [01:08<02:50, 34.15s/it]
 43%|████▎     | 3/7 [01:42<02:16, 34.13s/it]
 57%|█████▋    | 4/7 [02:16<01:42, 34.09s/it]
 71%|███████▏  | 5/7 [02:50<01:08, 34.09s/it]
 86%|████████▌ | 6/7 [03:24<00:34, 34.09s/it]
 100%|██████████| 7/7 [03:58<00:00, 34.08s/it]
 100%|██████████| 7/7 [03:58<00:00, 34.09s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
--- a/unitree_z1_dual_arm_stackbox/case1/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox/case1/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox/case1/unitree_z1_dual_arm_stackbox_case1.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox/case1/output/inference/5_full_fs4.mp4",
    "psnr": 43.97044934749157
 }
--- a/unitree_z1_dual_arm_stackbox/case1/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox/case1/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox"
        --n_iter 7 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox/case2/output.log
+++ b/unitree_z1_dual_arm_stackbox/case2/output.log
@@ -0,0 +1,108 @@
 2026-02-11 14:49:54.901507: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 14:49:54.951975: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 14:49:54.952023: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 14:49:54.953338: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 14:49:54.960938: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 14:49:55.888131: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/7 [00:00<?, ?it/s]
 14%|█▍        | 1/7 [00:34<03:25, 34.18s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 29%|██▊       | 2/7 [01:08<02:51, 34.26s/it]
 43%|████▎     | 3/7 [01:42<02:16, 34.25s/it]
 57%|█████▋    | 4/7 [02:16<01:42, 34.22s/it]
 71%|███████▏  | 5/7 [02:50<01:08, 34.17s/it]
 86%|████████▌ | 6/7 [03:25<00:34, 34.13s/it]
 100%|██████████| 7/7 [03:59<00:00, 34.11s/it]
 100%|██████████| 7/7 [03:59<00:00, 34.16s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
--- a/unitree_z1_dual_arm_stackbox/case2/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox/case2/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox/case2/unitree_z1_dual_arm_stackbox_case2.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox/case2/output/inference/15_full_fs4.mp4",
    "psnr": 43.70773432165555
 }
--- a/unitree_z1_dual_arm_stackbox/case2/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox/case2/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox"
        --n_iter 7 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox/case3/output.log
+++ b/unitree_z1_dual_arm_stackbox/case3/output.log
@@ -0,0 +1,108 @@
 2026-02-11 14:54:33.079012: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 14:54:33.128851: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 14:54:33.128900: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 14:54:33.130229: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 14:54:33.137786: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 14:54:34.065218: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/7 [00:00<?, ?it/s]
 14%|█▍        | 1/7 [00:34<03:25, 34.18s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 29%|██▊       | 2/7 [01:08<02:51, 34.24s/it]
 43%|████▎     | 3/7 [01:42<02:16, 34.25s/it]
 57%|█████▋    | 4/7 [02:16<01:42, 34.17s/it]
 71%|███████▏  | 5/7 [02:50<01:08, 34.13s/it]
 86%|████████▌ | 6/7 [03:24<00:34, 34.11s/it]
 100%|██████████| 7/7 [03:58<00:00, 34.09s/it]
 100%|██████████| 7/7 [03:58<00:00, 34.13s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
--- a/unitree_z1_dual_arm_stackbox/case3/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox/case3/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox/case3/unitree_z1_dual_arm_stackbox_case3.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox/case3/output/inference/25_full_fs4.mp4",
    "psnr": 48.68206289825236
 }
--- a/unitree_z1_dual_arm_stackbox/case3/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox/case3/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox"
        --n_iter 7 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox/case4/output.log
+++ b/unitree_z1_dual_arm_stackbox/case4/output.log
@@ -0,0 +1,108 @@
 2026-02-11 14:59:11.849320: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 14:59:11.899274: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 14:59:11.899322: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 14:59:11.900640: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 14:59:11.908158: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 14:59:12.830387: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/7 [00:00<?, ?it/s]
 14%|█▍        | 1/7 [00:34<03:24, 34.11s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 29%|██▊       | 2/7 [01:08<02:50, 34.17s/it]
 43%|████▎     | 3/7 [01:42<02:16, 34.13s/it]
 57%|█████▋    | 4/7 [02:16<01:42, 34.12s/it]
 71%|███████▏  | 5/7 [02:50<01:08, 34.07s/it]
 86%|████████▌ | 6/7 [03:24<00:34, 34.07s/it]
 100%|██████████| 7/7 [03:58<00:00, 34.06s/it]
 100%|██████████| 7/7 [03:58<00:00, 34.09s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
--- a/unitree_z1_dual_arm_stackbox/case4/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox/case4/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox/case4/unitree_z1_dual_arm_stackbox_case4.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox/case4/output/inference/35_full_fs4.mp4",
    "psnr": 42.117165235043196
 }
--- a/unitree_z1_dual_arm_stackbox/case4/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox/case4/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox"
        --n_iter 7 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox_v2/case1/output.log
+++ b/unitree_z1_dual_arm_stackbox_v2/case1/output.log
@@ -1,13 +1,16 @@
-2026-02-11 11:59:27.241485: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-11 16:58:21.710140: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
-2026-02-11 11:59:27.291755: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-11 16:58:21.759418: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
-2026-02-11 11:59:27.291807: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-11 16:58:21.759461: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
-2026-02-11 11:59:27.293169: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-11 16:58:21.760752: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
-2026-02-11 11:59:27.300838: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+2026-02-11 16:58:21.768205: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
-2026-02-11 11:59:28.228009: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+2026-02-11 16:58:22.691154: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 >>> Diffusion backbone (model.model) converted to FP16.
 >>> Projectors (image_proj_model, state_projector, action_projector) converted to FP16.
 >>> Encoders (cond_stage_model, embedder) converted to FP16.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
@@ -31,10 +34,40 @@ DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
-
+
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:23<03:52, 23.26s/it]
 18%|█▊        | 2/11 [00:45<03:25, 22.85s/it]
 27%|██▋       | 3/11 [01:08<03:02, 22.82s/it]
 36%|███▋      | 4/11 [01:31<02:39, 22.83s/it]
 45%|████▌     | 5/11 [01:54<02:17, 22.83s/it]
 55%|█████▍    | 6/11 [02:17<01:54, 22.83s/it]
 64%|██████▎   | 7/11 [02:39<01:31, 22.83s/it]
 73%|███████▎  | 8/11 [03:02<01:08, 22.83s/it]
 82%|████████▏ | 9/11 [03:25<00:45, 22.81s/it]
 91%|█████████ | 10/11 [03:48<00:22, 22.81s/it]
 100%|██████████| 11/11 [04:11<00:00, 22.79s/it]
 100%|██████████| 11/11 [04:11<00:00, 22.83s/it]
 >>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
@@ -84,37 +117,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 18%|█▊        | 2/11 [01:08<05:07, 34.17s/it]
 27%|██▋       | 3/11 [01:42<04:33, 34.16s/it]
 36%|███▋      | 4/11 [02:16<03:59, 34.18s/it]
 45%|████▌     | 5/11 [02:50<03:24, 34.14s/it]
 55%|█████▍    | 6/11 [03:24<02:50, 34.10s/it]
 64%|██████▎   | 7/11 [03:58<02:16, 34.07s/it]
 73%|███████▎  | 8/11 [04:32<01:42, 34.03s/it]
 82%|████████▏ | 9/11 [05:06<01:08, 34.02s/it]
 91%|█████████ | 10/11 [05:40<00:34, 34.04s/it]
 100%|██████████| 11/11 [06:14<00:00, 34.03s/it]
 100%|██████████| 11/11 [06:14<00:00, 34.07s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 DEBUG:PIL.Image:Importing PpmImagePlugin
->>> Step 7: generating actions ...
+DEBUG:PIL.Image:Importing PsdImagePlugin
->>> Step 7: interacting with world model ...
+DEBUG:PIL.Image:Importing QoiImagePlugin
->>>>>>>>>>>>>>>>>>>>>>>>
+DEBUG:PIL.Image:Importing SgiImagePlugin
--- a/unitree_z1_dual_arm_stackbox_v2/case1/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox_v2/case1/psnr_result.json
@@ -1,5 +1,5 @@
 {
-    "gt_video": "/home/qhy/unifolm-world-model-action/unitree_z1_dual_arm_stackbox_v2/case1/unitree_z1_dual_arm_stackbox_v2_case1.mp4",
+    "gt_video": "unitree_z1_dual_arm_stackbox_v2/case1/unitree_z1_dual_arm_stackbox_v2_case1.mp4",
-    "pred_video": "/home/qhy/unifolm-world-model-action/unitree_z1_dual_arm_stackbox_v2/case1/output/inference/5_full_fs4.mp4",
+    "pred_video": "unitree_z1_dual_arm_stackbox_v2/case1/output/inference/5_full_fs4.mp4",
-    "psnr": 28.167025381705358
+    "psnr": 26.683000215343522
 }
--- a/unitree_z1_dual_arm_stackbox_v2/case2/output.log
+++ b/unitree_z1_dual_arm_stackbox_v2/case2/output.log
@@ -0,0 +1,120 @@
 2026-02-11 15:10:45.687888: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 15:10:45.738006: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 15:10:45.738054: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 15:10:45.739410: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 15:10:45.747229: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 15:10:46.687896: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:34<05:41, 34.16s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 18%|█▊        | 2/11 [01:08<05:08, 34.26s/it]
 27%|██▋       | 3/11 [01:42<04:33, 34.20s/it]
 36%|███▋      | 4/11 [02:16<03:59, 34.16s/it]
 45%|████▌     | 5/11 [02:50<03:24, 34.15s/it]
 55%|█████▍    | 6/11 [03:24<02:50, 34.13s/it]
 64%|██████▎   | 7/11 [03:59<02:16, 34.13s/it]
 73%|███████▎  | 8/11 [04:33<01:42, 34.13s/it]
 82%|████████▏ | 9/11 [05:07<01:08, 34.12s/it]
 91%|█████████ | 10/11 [05:41<00:34, 34.11s/it]
 100%|██████████| 11/11 [06:15<00:00, 34.09s/it]
 100%|██████████| 11/11 [06:15<00:00, 34.13s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_z1_dual_arm_stackbox_v2/case2/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox_v2/case2/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox_v2/case2/unitree_z1_dual_arm_stackbox_v2_case2.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox_v2/case2/output/inference/15_full_fs4.mp4",
    "psnr": 33.945563782754554
 }
--- a/unitree_z1_dual_arm_stackbox_v2/case2/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox_v2/case2/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox_v2"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox_v2/case3/output.log
+++ b/unitree_z1_dual_arm_stackbox_v2/case3/output.log
@@ -0,0 +1,120 @@
 2026-02-11 15:17:41.661323: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 15:17:41.711317: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 15:17:41.711373: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 15:17:41.712706: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 15:17:41.720248: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 15:17:42.650151: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:34<05:41, 34.15s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 18%|█▊        | 2/11 [01:08<05:08, 34.25s/it]
 27%|██▋       | 3/11 [01:42<04:33, 34.23s/it]
 36%|███▋      | 4/11 [02:16<03:59, 34.18s/it]
 45%|████▌     | 5/11 [02:50<03:24, 34.17s/it]
 55%|█████▍    | 6/11 [03:25<02:50, 34.15s/it]
 64%|██████▎   | 7/11 [03:59<02:16, 34.13s/it]
 73%|███████▎  | 8/11 [04:33<01:42, 34.12s/it]
 82%|████████▏ | 9/11 [05:07<01:08, 34.09s/it]
 91%|█████████ | 10/11 [05:41<00:34, 34.09s/it]
 100%|██████████| 11/11 [06:15<00:00, 34.09s/it]
 100%|██████████| 11/11 [06:15<00:00, 34.13s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_z1_dual_arm_stackbox_v2/case3/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox_v2/case3/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox_v2/case3/unitree_z1_dual_arm_stackbox_v2_case3.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox_v2/case3/output/inference/25_full_fs4.mp4",
    "psnr": 31.86126241517472
 }
--- a/unitree_z1_dual_arm_stackbox_v2/case3/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox_v2/case3/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox_v2"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_dual_arm_stackbox_v2/case4/output.log
+++ b/unitree_z1_dual_arm_stackbox_v2/case4/output.log
@@ -0,0 +1,120 @@
 2026-02-11 15:24:38.487806: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 15:24:38.538144: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 15:24:38.538200: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 15:24:38.539554: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 15:24:38.547185: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 15:24:39.470885: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/11 [00:00<?, ?it/s]
  9%|▉         | 1/11 [00:34<05:41, 34.18s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 18%|█▊        | 2/11 [01:08<05:09, 34.35s/it]
 27%|██▋       | 3/11 [01:42<04:34, 34.33s/it]
 36%|███▋      | 4/11 [02:17<03:59, 34.26s/it]
 45%|████▌     | 5/11 [02:51<03:25, 34.22s/it]
 55%|█████▍    | 6/11 [03:25<02:50, 34.18s/it]
 64%|██████▎   | 7/11 [03:59<02:16, 34.14s/it]
 73%|███████▎  | 8/11 [04:33<01:42, 34.13s/it]
 82%|████████▏ | 9/11 [05:07<01:08, 34.09s/it]
 91%|█████████ | 10/11 [05:41<00:34, 34.09s/it]
 100%|██████████| 11/11 [06:15<00:00, 34.09s/it]
 100%|██████████| 11/11 [06:15<00:00, 34.15s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_z1_dual_arm_stackbox_v2/case4/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox_v2/case4/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_dual_arm_stackbox_v2/case4/unitree_z1_dual_arm_stackbox_v2_case4.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox_v2/case4/output/inference/35_full_fs4.mp4",
    "psnr": 39.90908062249536
 }
--- a/unitree_z1_dual_arm_stackbox_v2/case4/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox_v2/case4/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox_v2"
        --n_iter 11 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_stackbox/case1/output.log
+++ b/unitree_z1_stackbox/case1/output.log
@@ -0,0 +1,123 @@
 2026-02-11 15:31:35.657972: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 15:31:35.707733: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 15:31:35.707792: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 15:31:35.709109: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 15:31:35.716616: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 15:31:36.648540: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/12 [00:00<?, ?it/s]
  8%|▊         | 1/12 [00:34<06:15, 34.17s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 17%|█▋        | 2/12 [01:08<05:42, 34.24s/it]
 25%|██▌       | 3/12 [01:42<05:07, 34.20s/it]
 33%|███▎      | 4/12 [02:16<04:33, 34.19s/it]
 42%|████▏     | 5/12 [02:50<03:58, 34.14s/it]
 50%|█████     | 6/12 [03:24<03:24, 34.09s/it]
 58%|█████▊    | 7/12 [03:58<02:50, 34.07s/it]
 67%|██████▋   | 8/12 [04:32<02:16, 34.07s/it]
 75%|███████▌  | 9/12 [05:06<01:42, 34.05s/it]
 83%|████████▎ | 10/12 [05:41<01:08, 34.06s/it]
 92%|█████████▏| 11/12 [06:15<00:34, 34.08s/it]
 100%|██████████| 12/12 [06:49<00:00, 34.07s/it]
 100%|██████████| 12/12 [06:49<00:00, 34.10s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
--- a/unitree_z1_stackbox/case1/psnr_result.json
+++ b/unitree_z1_stackbox/case1/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_stackbox/case1/unitree_z1_stackbox_case1.mp4",
    "pred_video": "unitree_z1_stackbox/case1/output/inference/5_full_fs4.mp4",
    "psnr": 49.42336701518203
 }
--- a/unitree_z1_stackbox/case1/run_world_model_interaction.sh
+++ b/unitree_z1_stackbox/case1/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_stackbox"
        --n_iter 12 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_stackbox/case2/output.log
+++ b/unitree_z1_stackbox/case2/output.log
@@ -0,0 +1,123 @@
 2026-02-11 15:39:01.409308: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 15:39:01.459136: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 15:39:01.459190: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 15:39:01.460507: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 15:39:01.468019: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 15:39:02.395912: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/12 [00:00<?, ?it/s]
  8%|▊         | 1/12 [00:34<06:15, 34.10s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 17%|█▋        | 2/12 [01:08<05:41, 34.18s/it]
 25%|██▌       | 3/12 [01:42<05:07, 34.15s/it]
 33%|███▎      | 4/12 [02:16<04:33, 34.13s/it]
 42%|████▏     | 5/12 [02:50<03:58, 34.09s/it]
 50%|█████     | 6/12 [03:24<03:24, 34.08s/it]
 58%|█████▊    | 7/12 [03:58<02:50, 34.06s/it]
 67%|██████▋   | 8/12 [04:32<02:16, 34.03s/it]
 75%|███████▌  | 9/12 [05:06<01:42, 34.03s/it]
 83%|████████▎ | 10/12 [05:40<01:08, 34.03s/it]
 92%|█████████▏| 11/12 [06:14<00:34, 34.02s/it]
 100%|██████████| 12/12 [06:48<00:00, 34.00s/it]
 100%|██████████| 12/12 [06:48<00:00, 34.05s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
--- a/unitree_z1_stackbox/case2/psnr_result.json
+++ b/unitree_z1_stackbox/case2/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_stackbox/case2/unitree_z1_stackbox_case2.mp4",
    "pred_video": "unitree_z1_stackbox/case2/output/inference/15_full_fs4.mp4",
    "psnr": 48.88265200549669
 }
--- a/unitree_z1_stackbox/case2/run_world_model_interaction.sh
+++ b/unitree_z1_stackbox/case2/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_stackbox"
        --n_iter 12 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_stackbox/case3/output.log
+++ b/unitree_z1_stackbox/case3/output.log
@@ -0,0 +1,123 @@
 2026-02-11 15:46:27.304090: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 15:46:27.354074: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 15:46:27.354120: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 15:46:27.355468: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 15:46:27.363130: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 15:46:28.290783: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/12 [00:00<?, ?it/s]
  8%|▊         | 1/12 [00:34<06:15, 34.15s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 17%|█▋        | 2/12 [01:08<05:42, 34.24s/it]
 25%|██▌       | 3/12 [01:42<05:07, 34.22s/it]
 33%|███▎      | 4/12 [02:16<04:33, 34.16s/it]
 42%|████▏     | 5/12 [02:50<03:58, 34.11s/it]
 50%|█████     | 6/12 [03:24<03:24, 34.08s/it]
 58%|█████▊    | 7/12 [03:58<02:50, 34.04s/it]
 67%|██████▋   | 8/12 [04:32<02:16, 34.02s/it]
 75%|███████▌  | 9/12 [05:06<01:41, 33.99s/it]
 83%|████████▎ | 10/12 [05:40<01:07, 33.99s/it]
 92%|█████████▏| 11/12 [06:14<00:33, 33.97s/it]
 100%|██████████| 12/12 [06:48<00:00, 33.96s/it]
 100%|██████████| 12/12 [06:48<00:00, 34.04s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
--- a/unitree_z1_stackbox/case3/psnr_result.json
+++ b/unitree_z1_stackbox/case3/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_stackbox/case3/unitree_z1_stackbox_case3.mp4",
    "pred_video": "unitree_z1_stackbox/case3/output/inference/25_full_fs4.mp4",
    "psnr": 50.884297816906816
 }
--- a/unitree_z1_stackbox/case3/run_world_model_interaction.sh
+++ b/unitree_z1_stackbox/case3/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_stackbox"
        --n_iter 12 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
--- a/unitree_z1_stackbox/case4/output.log
+++ b/unitree_z1_stackbox/case4/output.log
@@ -0,0 +1,123 @@
 2026-02-11 15:53:52.504337: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
 2026-02-11 15:53:52.554351: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
 2026-02-11 15:53:52.554397: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
 2026-02-11 15:53:52.555718: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
 2026-02-11 15:53:52.563252: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 2026-02-11 15:53:53.493343: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
 >>> unitree_z1_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox: normalizer initiated.
 >>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
 >>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
 >>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
 >>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
 >>> unitree_g1_pack_camera: 1 data samples loaded.
 >>> unitree_g1_pack_camera: data stats loaded.
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
  0%|          | 0/12 [00:00<?, ?it/s]
  8%|▊         | 1/12 [00:34<06:15, 34.14s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 DEBUG:PIL.Image:Importing BlpImagePlugin
 DEBUG:PIL.Image:Importing BmpImagePlugin
 DEBUG:PIL.Image:Importing BufrStubImagePlugin
 DEBUG:PIL.Image:Importing CurImagePlugin
 DEBUG:PIL.Image:Importing DcxImagePlugin
 DEBUG:PIL.Image:Importing DdsImagePlugin
 DEBUG:PIL.Image:Importing EpsImagePlugin
 DEBUG:PIL.Image:Importing FitsImagePlugin
 DEBUG:PIL.Image:Importing FitsStubImagePlugin
 DEBUG:PIL.Image:Importing FliImagePlugin
 DEBUG:PIL.Image:Importing FpxImagePlugin
 DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing FtexImagePlugin
 DEBUG:PIL.Image:Importing GbrImagePlugin
 DEBUG:PIL.Image:Importing GifImagePlugin
 DEBUG:PIL.Image:Importing GribStubImagePlugin
 DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
 DEBUG:PIL.Image:Importing IcnsImagePlugin
 DEBUG:PIL.Image:Importing IcoImagePlugin
 DEBUG:PIL.Image:Importing ImImagePlugin
 DEBUG:PIL.Image:Importing ImtImagePlugin
 DEBUG:PIL.Image:Importing IptcImagePlugin
 DEBUG:PIL.Image:Importing JpegImagePlugin
 DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
 DEBUG:PIL.Image:Importing McIdasImagePlugin
 DEBUG:PIL.Image:Importing MicImagePlugin
 DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 DEBUG:PIL.Image:Importing MpegImagePlugin
 DEBUG:PIL.Image:Importing MpoImagePlugin
 DEBUG:PIL.Image:Importing MspImagePlugin
 DEBUG:PIL.Image:Importing PalmImagePlugin
 DEBUG:PIL.Image:Importing PcdImagePlugin
 DEBUG:PIL.Image:Importing PcxImagePlugin
 DEBUG:PIL.Image:Importing PdfImagePlugin
 DEBUG:PIL.Image:Importing PixarImagePlugin
 DEBUG:PIL.Image:Importing PngImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
 DEBUG:PIL.Image:Importing SgiImagePlugin
 DEBUG:PIL.Image:Importing SpiderImagePlugin
 DEBUG:PIL.Image:Importing SunImagePlugin
 DEBUG:PIL.Image:Importing TgaImagePlugin
 DEBUG:PIL.Image:Importing TiffImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
 17%|█▋        | 2/12 [01:08<05:41, 34.19s/it]
 25%|██▌       | 3/12 [01:42<05:07, 34.15s/it]
 33%|███▎      | 4/12 [02:16<04:32, 34.12s/it]
 42%|████▏     | 5/12 [02:50<03:58, 34.09s/it]
 50%|█████     | 6/12 [03:24<03:24, 34.05s/it]
 58%|█████▊    | 7/12 [03:58<02:50, 34.03s/it]
 67%|██████▋   | 8/12 [04:32<02:16, 34.04s/it]
 75%|███████▌  | 9/12 [05:06<01:42, 34.03s/it]
 83%|████████▎ | 10/12 [05:40<01:08, 34.01s/it]
 92%|█████████▏| 11/12 [06:14<00:34, 34.03s/it]
 100%|██████████| 12/12 [06:48<00:00, 34.02s/it]
 100%|██████████| 12/12 [06:48<00:00, 34.05s/it]
 >>> Step 1: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 2: generating actions ...
 >>> Step 2: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 3: generating actions ...
 >>> Step 3: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 4: generating actions ...
 >>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 7: generating actions ...
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 8: generating actions ...
 >>> Step 8: interacting with world model ...
--- a/unitree_z1_stackbox/case4/psnr_result.json
+++ b/unitree_z1_stackbox/case4/psnr_result.json
@@ -0,0 +1,5 @@
 {
    "gt_video": "unitree_z1_stackbox/case4/unitree_z1_stackbox_case4.mp4",
    "pred_video": "unitree_z1_stackbox/case4/output/inference/35_full_fs4.mp4",
    "psnr": 47.85197517791449
 }
--- a/unitree_z1_stackbox/case4/run_world_model_interaction.sh
+++ b/unitree_z1_stackbox/case4/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_stackbox"
        --n_iter 12 \
        --timestep_spacing 'uniform_trailing' \
        --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"
Author	SHA1	Message	Date
qhy	508b91f5a2	延迟 decode，只解码 CLIP 需要的 1 帧 - world model 调用 decode_video=False，跳过 16 帧全量 decode - 只 decode 最后 1 帧给 CLIP embedding / observation queue - 存 raw latent，循环结束后统一 batch decode 生成最终视频 - 每轮省 15 次 VAE decode，8 轮共省 120 次 - 跳过中间迭代的 wm tensorboard/mp4 保存 psnr微弱下降	2026-02-11 17:07:33 +08:00
qhy	3101252c25	速度变化不明显psnr显著提升	2026-02-11 16:38:21 +08:00
qhy	f386a5810b	补充上次提交	2026-02-11 16:24:40 +08:00
qhy	352a79035f	主干部分fp16,最敏感psnr=25.21,可以考虑对主干部分太敏感的部分回退fp32	2026-02-11 16:23:21 +08:00