成功的尝试

2026-02-18 19:14:55 +08:00
parent 9a08e27a19
commit 65788be1b3
32 changed files with 1772 additions and 50 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -132,3 +132,5 @@ Experiment/log

 *.0
 ckpts/unifolm_wma_dual.ckpt.prepared.pt
+trt_engines/video_backbone.engine
+trt_engines/video_backbone.onnx
--- a/run_all_case.sh
+++ b/run_all_case.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+
+# Batch driver: runs every case of every scenario, one after another.
+# 5 scenarios x 4 cases each = 20 runs in total.
+
+# Environment: force offline mode for the Hugging Face libraries.
+export HF_HUB_OFFLINE=1
+export TRANSFORMERS_OFFLINE=1
+
+# ANSI colour escapes, interpreted later by `echo -e`.
+NC='\033[0m' # No Color
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+
+# Scenario directories to run.
+SCENARIOS=(
+    unitree_g1_pack_camera
+    unitree_z1_dual_arm_cleanup_pencils
+    unitree_z1_dual_arm_stackbox
+    unitree_z1_dual_arm_stackbox_v2
+    unitree_z1_stackbox
+)
+
+# Case numbers; each maps to a "case<N>" sub-directory of a scenario.
+CASES=({1..4})
+
+# Start timestamp (epoch seconds) and the timestamped log file name.
+START_TIME=$(date +%s)
+LOG_FILE="run_all_cases_$(date +%Y%m%d_%H%M%S).log"
+
+# Progress counters plus the list of failed cases.
+num_scenarios=${#SCENARIOS[@]}
+num_cases=${#CASES[@]}
+TOTAL_CASES=$((num_scenarios * num_cases))
+CURRENT_CASE=0
+SUCCESS_COUNT=0
+FAIL_COUNT=0
+declare -a FAILED_CASES
+
+# Start-up banner.
+echo -e "${BLUE}========================================${NC}"
+echo -e "${BLUE}开始执行所有场景的case${NC}"
+echo -e "${BLUE}总共: ${num_scenarios} 个场景 x ${num_cases} 个case = ${TOTAL_CASES} 个任务${NC}"
+echo -e "${BLUE}日志文件: ${LOG_FILE}${NC}"
+echo -e "${BLUE}========================================${NC}"
+echo ""
+
+# Walk every scenario, and every case inside each scenario.
+for scenario_name in "${SCENARIOS[@]}"; do
+    echo -e "${YELLOW}>>> 场景: ${scenario_name}${NC}"
+
+    for case_id in "${CASES[@]}"; do
+        ((CURRENT_CASE += 1))
+        run_dir="${scenario_name}/case${case_id}"
+        runner="${run_dir}/run_world_model_interaction.sh"
+
+        echo -e "${BLUE}[${CURRENT_CASE}/${TOTAL_CASES}] 执行: ${run_dir}${NC}"
+
+        if [[ -f "${runner}" ]]; then
+            echo "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"
+
+            # stdout and stderr of the case script are appended to the shared
+            # log file; only the pass/fail verdict is shown on the console.
+            if bash "${runner}" >> "${LOG_FILE}" 2>&1; then
+                echo -e "${GREEN}✓ 成功: ${run_dir}${NC}"
+                ((SUCCESS_COUNT += 1))
+            else
+                echo -e "${RED}✗ 失败: ${run_dir}${NC}"
+                ((FAIL_COUNT += 1))
+                FAILED_CASES+=("${run_dir}")
+            fi
+
+            echo "结束时间: $(date '+%Y-%m-%d %H:%M:%S')"
+            echo ""
+        else
+            # A missing case script counts as a failure; move on to the next case.
+            echo -e "${RED}错误: 脚本不存在 ${runner}${NC}"
+            ((FAIL_COUNT += 1))
+            FAILED_CASES+=("${run_dir} (脚本不存在)")
+        fi
+    done
+
+    echo ""
+done
+
+# 计算总耗时
+END_TIME=$(date +%s)
+DURATION=$((END_TIME - START_TIME))
+HOURS=$((DURATION / 3600))
+MINUTES=$(((DURATION % 3600) / 60))
+SECONDS=$((DURATION % 60))
+
+# 输出总结
+echo -e "${BLUE}========================================${NC}"
+echo -e "${BLUE}执行完成！${NC}"
+echo -e "${BLUE}========================================${NC}"
+echo -e "总任务数: ${TOTAL_CASES}"
+echo -e "${GREEN}成功: ${SUCCESS_COUNT}${NC}"
+echo -e "${RED}失败: ${FAIL_COUNT}${NC}"
+echo -e "总耗时: ${HOURS}小时 ${MINUTES}分钟 ${SECONDS}秒"
+echo -e "详细日志: ${LOG_FILE}"
+echo ""
+
+# 如果有失败的case，列出来
+if [ ${FAIL_COUNT} -gt 0 ]; then
+    echo -e "${RED}失败的case列表:${NC}"
+    for failed_case in "${FAILED_CASES[@]}"; do
+        echo -e "${RED}  - ${failed_case}${NC}"
+    done
+    echo ""
+fi
+
+echo -e "${BLUE}========================================${NC}"
--- a/run_all_cases_20260218_190150.log
+++ b/run_all_cases_20260218_190150.log
@@ -0,0 +1,504 @@
+2026-02-18 19:01:56.891895: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-18 19:01:56.940243: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-18 19:01:56.940285: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-18 19:01:56.941395: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-18 19:01:56.948327: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2026-02-18 19:01:57.870809: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+Global seed set to 123
+>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
+>>> Prepared model loaded.
+INFO:root:***** Configing Data *****
+>>> unitree_z1_stackbox: 1 data samples loaded.
+>>> unitree_z1_stackbox: data stats loaded.
+>>> unitree_z1_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
+>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
+>>> unitree_g1_pack_camera: 1 data samples loaded.
+>>> unitree_g1_pack_camera: data stats loaded.
+>>> unitree_g1_pack_camera: normalizer initiated.
+>>> Dataset is successfully loaded ...
+    ✓ KV fused: 66 attention layers
+    TRT output 'y': [1, 4, 16, 40, 64] torch.float32
+    TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
+    TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
+>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
+>>> Generate 16 frames under each generation ...
+DEBUG:h5py._conv:Creating converter from 3 to 5
+DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
+DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
+DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
+
+  0%|          | 0/11 [00:00<?, ?it/s][02/18/2026-19:02:10] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
+
+  9%|▉         | 1/11 [00:17<02:51, 17.15s/it]>>> Step 0: generating actions ...
+>>> Step 0: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 1: generating actions ...
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
+DEBUG:PIL.Image:Importing PpmImagePlugin
+DEBUG:PIL.Image:Importing PsdImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
+DEBUG:PIL.Image:Importing PpmImagePlugin
+DEBUG:PIL.Image:Importing PsdImagePlugin
+DEBUG:PIL.Image:Importing QoiImagePlugin
+DEBUG:PIL.Image:Importing QoiImagePlugin
+DEBUG:PIL.Image:Importing SgiImagePlugin
+DEBUG:PIL.Image:Importing SgiImagePlugin
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
+DEBUG:PIL.Image:Importing TiffImagePlugin
+DEBUG:PIL.Image:Importing WebPImagePlugin
+DEBUG:PIL.Image:Importing TiffImagePlugin
+DEBUG:PIL.Image:Importing WebPImagePlugin
+DEBUG:PIL.Image:Importing WmfImagePlugin
+DEBUG:PIL.Image:Importing WmfImagePlugin
+DEBUG:PIL.Image:Importing XbmImagePlugin
+DEBUG:PIL.Image:Importing XbmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XVThumbImagePlugin
+DEBUG:PIL.Image:Importing XVThumbImagePlugin
+
+ 18%|█▊        | 2/11 [00:33<02:31, 16.87s/it]
+ 27%|██▋       | 3/11 [00:50<02:14, 16.76s/it]
+ 36%|███▋      | 4/11 [01:07<01:57, 16.81s/it]
+ 45%|████▌     | 5/11 [01:24<01:41, 16.85s/it]
+ 55%|█████▍    | 6/11 [01:41<01:24, 16.82s/it]
+ 64%|██████▎   | 7/11 [01:57<01:07, 16.82s/it]
+ 73%|███████▎  | 8/11 [02:14<00:50, 16.83s/it]
+ 82%|████████▏ | 9/11 [02:31<00:33, 16.80s/it]
+ 91%|█████████ | 10/11 [02:48<00:16, 16.81s/it]
+100%|██████████| 11/11 [03:05<00:00, 16.81s/it]
+100%|██████████| 11/11 [03:05<00:00, 16.83s/it]
+>>> Step 1: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 2: generating actions ...
+>>> Step 2: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 3: generating actions ...
+>>> Step 3: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 4: generating actions ...
+>>> Step 4: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 5: generating actions ...
+>>> Step 5: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 6: generating actions ...
+>>> Step 6: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 7: generating actions ...
+>>> Step 7: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 8: generating actions ...
+>>> Step 8: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 9: generating actions ...
+>>> Step 9: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 10: generating actions ...
+>>> Step 10: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+
+real	3m49.072s
+user	4m16.055s
+sys	0m44.636s
+2026-02-18 19:05:45.956647: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-18 19:05:46.004149: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-18 19:05:46.004193: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-18 19:05:46.005265: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-18 19:05:46.012074: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2026-02-18 19:05:46.932966: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+Global seed set to 123
+>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
+>>> Prepared model loaded.
+INFO:root:***** Configing Data *****
+>>> unitree_z1_stackbox: 1 data samples loaded.
+>>> unitree_z1_stackbox: data stats loaded.
+>>> unitree_z1_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
+>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
+>>> unitree_g1_pack_camera: 1 data samples loaded.
+>>> unitree_g1_pack_camera: data stats loaded.
+>>> unitree_g1_pack_camera: normalizer initiated.
+>>> Dataset is successfully loaded ...
+    ✓ KV fused: 66 attention layers
+    TRT output 'y': [1, 4, 16, 40, 64] torch.float32
+    TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
+    TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
+>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
+>>> Generate 16 frames under each generation ...
+DEBUG:h5py._conv:Creating converter from 3 to 5
+DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
+DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
+DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
+
+  0%|          | 0/11 [00:00<?, ?it/s][02/18/2026-19:05:59] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
+
+  9%|▉         | 1/11 [00:16<02:47, 16.71s/it]>>> Step 0: generating actions ...
+>>> Step 0: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 1: generating actions ...
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
+DEBUG:PIL.Image:Importing PpmImagePlugin
+DEBUG:PIL.Image:Importing PsdImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
+DEBUG:PIL.Image:Importing PpmImagePlugin
+DEBUG:PIL.Image:Importing PsdImagePlugin
+DEBUG:PIL.Image:Importing QoiImagePlugin
+DEBUG:PIL.Image:Importing QoiImagePlugin
+DEBUG:PIL.Image:Importing SgiImagePlugin
+DEBUG:PIL.Image:Importing SgiImagePlugin
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
+DEBUG:PIL.Image:Importing TiffImagePlugin
+DEBUG:PIL.Image:Importing WebPImagePlugin
+DEBUG:PIL.Image:Importing TiffImagePlugin
+DEBUG:PIL.Image:Importing WebPImagePlugin
+DEBUG:PIL.Image:Importing WmfImagePlugin
+DEBUG:PIL.Image:Importing WmfImagePlugin
+DEBUG:PIL.Image:Importing XbmImagePlugin
+DEBUG:PIL.Image:Importing XbmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XVThumbImagePlugin
+DEBUG:PIL.Image:Importing XVThumbImagePlugin
+
+ 18%|█▊        | 2/11 [00:33<02:30, 16.75s/it]
+ 27%|██▋       | 3/11 [00:50<02:15, 16.91s/it]
+ 36%|███▋      | 4/11 [01:07<01:59, 17.02s/it]
+ 45%|████▌     | 5/11 [01:24<01:41, 16.98s/it]
+ 55%|█████▍    | 6/11 [01:41<01:24, 16.94s/it]
+ 64%|██████▎   | 7/11 [01:58<01:07, 16.90s/it]
+ 73%|███████▎  | 8/11 [02:15<00:50, 16.83s/it]
+ 82%|████████▏ | 9/11 [02:31<00:33, 16.80s/it]
+ 91%|█████████ | 10/11 [02:49<00:16, 16.94s/it]
+100%|██████████| 11/11 [03:06<00:00, 16.97s/it]
+100%|██████████| 11/11 [03:06<00:00, 16.91s/it]
+>>> Step 1: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 2: generating actions ...
+>>> Step 2: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 3: generating actions ...
+>>> Step 3: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 4: generating actions ...
+>>> Step 4: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 5: generating actions ...
+>>> Step 5: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 6: generating actions ...
+>>> Step 6: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 7: generating actions ...
+>>> Step 7: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 8: generating actions ...
+>>> Step 8: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 9: generating actions ...
+>>> Step 9: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 10: generating actions ...
+>>> Step 10: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+
+real	3m49.162s
+user	4m12.814s
+sys	0m45.565s
+2026-02-18 19:09:35.113634: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-18 19:09:35.161428: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-18 19:09:35.161474: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-18 19:09:35.162551: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-18 19:09:35.169325: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2026-02-18 19:09:36.089250: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+Global seed set to 123
+>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
+>>> Prepared model loaded.
+INFO:root:***** Configing Data *****
+>>> unitree_z1_stackbox: 1 data samples loaded.
+>>> unitree_z1_stackbox: data stats loaded.
+>>> unitree_z1_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
+>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
+>>> unitree_g1_pack_camera: 1 data samples loaded.
+>>> unitree_g1_pack_camera: data stats loaded.
+>>> unitree_g1_pack_camera: normalizer initiated.
+>>> Dataset is successfully loaded ...
+    ✓ KV fused: 66 attention layers
+    TRT output 'y': [1, 4, 16, 40, 64] torch.float32
+    TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
+    TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
+>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
+>>> Generate 16 frames under each generation ...
+DEBUG:h5py._conv:Creating converter from 3 to 5
+DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
+DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
+DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
+
+  0%|          | 0/11 [00:00<?, ?it/s][02/18/2026-19:09:49] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
+
+  9%|▉         | 1/11 [00:16<02:45, 16.53s/it]>>> Step 0: generating actions ...
+>>> Step 0: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 1: generating actions ...
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
--- a/run_all_psnr.sh
+++ b/run_all_psnr.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+cd "$SCRIPT_DIR"
+
+SCENARIOS=(
+    unitree_g1_pack_camera
+    unitree_z1_dual_arm_cleanup_pencils
+    unitree_z1_dual_arm_stackbox
+    unitree_z1_dual_arm_stackbox_v2
+    unitree_z1_stackbox
+)
+
+CASES=(case1 case2 case3 case4)
+
+total=0
+success=0
+fail=0
+
+for scenario in "${SCENARIOS[@]}"; do
+    for case in "${CASES[@]}"; do
+        case_dir="${scenario}/${case}"
+        gt_video="${case_dir}/${scenario}_${case}.mp4"
+        pred_video=$(ls "${case_dir}"/output/inference/*_full_fs*.mp4 2>/dev/null | head -1)
+        output_file="${case_dir}/psnr_result.json"
+
+        total=$((total + 1))
+        echo "=========================================="
+        echo "[${total}/20] ${case_dir}"
+
+        if [ ! -f "$gt_video" ]; then
+            echo "  SKIP: GT video not found: $gt_video"
+            fail=$((fail + 1))
+            continue
+        fi
+        if [ -z "$pred_video" ]; then
+            echo "  SKIP: pred video not found in ${case_dir}/output/inference/"
+            fail=$((fail + 1))
+            continue
+        fi
+
+        echo "  GT:   $gt_video"
+        echo "  Pred: $pred_video"
+        echo "  Out:  $output_file"
+
+        if python3 psnr_score_for_challenge.py \
+            --gt_video "$gt_video" \
+            --pred_video "$pred_video" \
+            --output_file "$output_file"; then
+            success=$((success + 1))
+            echo "  DONE"
+        else
+            fail=$((fail + 1))
+            echo "  FAILED"
+        fi
+    done
+done
+
+echo "=========================================="
+echo "Finished: ${success} success, ${fail} fail, ${total} total"
--- a/scripts/evaluation/world_model_interaction.py
+++ b/scripts/evaluation/world_model_interaction.py
@@ -585,6 +585,11 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                   if isinstance(m, CrossAttention) and m.fuse_kv())
    print(f"    ✓ KV fused: {kv_count} attention layers")

+    # Load TRT backbone if engine exists
+    trt_engine_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', 'trt_engines', 'video_backbone.engine')
+    if os.path.exists(trt_engine_path):
+        model.model.diffusion_model.load_trt_backbone(trt_engine_path)
+
    # Run over data
    assert (args.height % 16 == 0) and (
        args.width % 16
--- a/scripts/export_trt.py
+++ b/scripts/export_trt.py
@@ -0,0 +1,87 @@
+"""Export video UNet backbone to ONNX, then convert to TensorRT engine.
+
+Usage:
+    python scripts/export_trt.py \
+        --ckpt ckpts/unifolm_wma_dual.ckpt.prepared.pt \
+        --config configs/inference/world_model_interaction.yaml \
+        --out_dir trt_engines
+"""
+
+import os
+import sys
+import argparse
+
+import torch
+import tensorrt as trt
+from omegaconf import OmegaConf
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
+from unifolm_wma.utils.utils import instantiate_from_config
+from unifolm_wma.trt_utils import export_backbone_onnx
+
+
+def load_model(config_path, ckpt_path):
+    if ckpt_path.endswith('.prepared.pt'):
+        model = torch.load(ckpt_path, map_location='cpu')
+    else:
+        config = OmegaConf.load(config_path)
+        model = instantiate_from_config(config.model)
+        state_dict = torch.load(ckpt_path, map_location='cpu')
+        if 'state_dict' in state_dict:
+            state_dict = state_dict['state_dict']
+        model.load_state_dict(state_dict, strict=False)
+    model.eval().cuda()
+    return model
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--ckpt', required=True)
+    parser.add_argument('--config', default='configs/inference/world_model_interaction.yaml')
+    parser.add_argument('--out_dir', default='trt_engines')
+    parser.add_argument('--context_len', type=int, default=95)
+    parser.add_argument('--fp16', action='store_true', default=True)
+    args = parser.parse_args()
+
+    os.makedirs(args.out_dir, exist_ok=True)
+    onnx_path = os.path.join(args.out_dir, 'video_backbone.onnx')
+    engine_path = os.path.join(args.out_dir, 'video_backbone.engine')
+
+    if os.path.exists(onnx_path):
+        print(f">>> ONNX already exists at {onnx_path}, skipping export.")
+        n_outputs = 10
+    else:
+        print(">>> Loading model ...")
+        model = load_model(args.config, args.ckpt)
+        print(">>> Exporting ONNX ...")
+        with torch.no_grad():
+            n_outputs = export_backbone_onnx(model, onnx_path, context_len=args.context_len)
+        del model
+        torch.cuda.empty_cache()
+
+    print(">>> Converting ONNX -> TensorRT engine ...")
+    logger = trt.Logger(trt.Logger.WARNING)
+    builder = trt.Builder(logger)
+    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+    parser = trt.OnnxParser(network, logger)
+
+    if not parser.parse_from_file(os.path.abspath(onnx_path)):
+        for i in range(parser.num_errors):
+            print(f"    ONNX parse error: {parser.get_error(i)}")
+        raise RuntimeError("ONNX parsing failed")
+
+    config = builder.create_builder_config()
+    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 16 << 30)
+    if args.fp16:
+        config.set_flag(trt.BuilderFlag.FP16)
+
+    engine_bytes = builder.build_serialized_network(network, config)
+    with open(engine_path, 'wb') as f:
+        f.write(engine_bytes)
+
+    print(f"\n>>> Done! Engine saved to {engine_path}")
+    print(f"    Outputs: 1 y + {n_outputs - 1} hs_a tensors")
+
+
+if __name__ == '__main__':
+    main()
--- a/src/unifolm_wma/modules/networks/wma_model.py
+++ b/src/unifolm_wma/modules/networks/wma_model.py
@@ -688,6 +688,7 @@ class WMAModel(nn.Module):
        # Context precomputation cache
        self._ctx_cache_enabled = False
        self._ctx_cache = {}
+        self._trt_backbone = None  # TRT engine for video UNet backbone
        # Reusable CUDA stream for parallel state_unet / action_unet
        self._state_stream = torch.cuda.Stream()

@@ -700,6 +701,12 @@ class WMAModel(nn.Module):
        self.__dict__.update(state)
        self._state_stream = torch.cuda.Stream()

+    def load_trt_backbone(self, engine_path, n_hs_a=9):
+        """Load a TensorRT engine for the video UNet backbone."""
+        from unifolm_wma.trt_utils import TRTBackbone
+        self._trt_backbone = TRTBackbone(engine_path, n_hs_a=n_hs_a)
+        print(f">>> TRT backbone loaded from {engine_path}")
+
    def forward(self,
                x: Tensor,
                x_action: Tensor,
@@ -812,44 +819,50 @@ class WMAModel(nn.Module):
            fs_embed = fs_embed.repeat_interleave(repeats=t, dim=0)
            emb = emb + fs_embed

-        h = x.type(self.dtype)
-        adapter_idx = 0
-        hs = []
-        hs_a = []
-        for id, module in enumerate(self.input_blocks):
-            h = module(h, emb, context=context, batch_size=b)
-            if id == 0 and self.addition_attention:
-                h = self.init_attn(h, emb, context=context, batch_size=b)
-            # plug-in adapter features
-            if ((id + 1) % 3 == 0) and features_adapter is not None:
-                h = h + features_adapter[adapter_idx]
-                adapter_idx += 1
-            if id != 0:
-                if isinstance(module[0], Downsample):
+        if self._trt_backbone is not None:
+            # TRT path: run backbone via TensorRT engine
+            h_in = x.type(self.dtype).contiguous()
+            y, hs_a = self._trt_backbone(h_in, emb.contiguous(), context.contiguous())
+        else:
+            # PyTorch path: original backbone
+            h = x.type(self.dtype)
+            adapter_idx = 0
+            hs = []
+            hs_a = []
+            for id, module in enumerate(self.input_blocks):
+                h = module(h, emb, context=context, batch_size=b)
+                if id == 0 and self.addition_attention:
+                    h = self.init_attn(h, emb, context=context, batch_size=b)
+                # plug-in adapter features
+                if ((id + 1) % 3 == 0) and features_adapter is not None:
+                    h = h + features_adapter[adapter_idx]
+                    adapter_idx += 1
+                if id != 0:
+                    if isinstance(module[0], Downsample):
+                        hs_a.append(
+                            rearrange(hs[-1], '(b t) c h w -> b t c h w', t=t))
+                hs.append(h)
+            hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
+
+            if features_adapter is not None:
+                assert len(
+                    features_adapter) == adapter_idx, 'Wrong features_adapter'
+            h = self.middle_block(h, emb, context=context, batch_size=b)
+            hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
+
+            hs_out = []
+            for module in self.output_blocks:
+                h = torch.cat([h, hs.pop()], dim=1)
+                h = module(h, emb, context=context, batch_size=b)
+                if isinstance(module[-1], Upsample):
                    hs_a.append(
-                        rearrange(hs[-1], '(b t) c h w -> b t c h w', t=t))
-            hs.append(h)
-        hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
+                        rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
+                hs_out.append(h)
+            h = h.type(x.dtype)
+            hs_a.append(rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))

-        if features_adapter is not None:
-            assert len(
-                features_adapter) == adapter_idx, 'Wrong features_adapter'
-        h = self.middle_block(h, emb, context=context, batch_size=b)
-        hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
-
-        hs_out = []
-        for module in self.output_blocks:
-            h = torch.cat([h, hs.pop()], dim=1)
-            h = module(h, emb, context=context, batch_size=b)
-            if isinstance(module[-1], Upsample):
-                hs_a.append(
-                    rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
-            hs_out.append(h)
-        h = h.type(x.dtype)
-        hs_a.append(rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
-
-        y = self.out(h)
-        y = rearrange(y, '(b t) c h w -> b c t h w', b=b)
+            y = self.out(h)
+            y = rearrange(y, '(b t) c h w -> b c t h w', b=b)

        if not self.base_model_gen_only:
            ba, _, _ = x_action.shape
--- a/src/unifolm_wma/trt_utils.py
+++ b/src/unifolm_wma/trt_utils.py
@@ -0,0 +1,151 @@
+"""TensorRT acceleration utilities for the video UNet backbone."""
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+from unifolm_wma.modules.networks.wma_model import Downsample, Upsample
+
+
+class VideoBackboneForExport(nn.Module):
+    """Wrapper that isolates the video UNet backbone for ONNX export.
+
+    Takes already-preprocessed inputs (after context/time embedding prep)
+    and returns y + hs_a as a flat tuple.
+    """
+
+    def __init__(self, wma_model):
+        super().__init__()
+        self.input_blocks = wma_model.input_blocks
+        self.middle_block = wma_model.middle_block
+        self.output_blocks = wma_model.output_blocks
+        self.out = wma_model.out
+        self.addition_attention = wma_model.addition_attention
+        if self.addition_attention:
+            self.init_attn = wma_model.init_attn
+        self.dtype = wma_model.dtype
+
+    def forward(self, h, emb, context):
+        t = 16
+        b = 1
+
+        hs = []
+        hs_a = []
+        h = h.type(self.dtype)
+        for id, module in enumerate(self.input_blocks):
+            h = module(h, emb, context=context, batch_size=b)
+            if id == 0 and self.addition_attention:
+                h = self.init_attn(h, emb, context=context, batch_size=b)
+            if id != 0:
+                if isinstance(module[0], Downsample):
+                    hs_a.append(rearrange(hs[-1], '(b t) c h w -> b t c h w', t=t))
+            hs.append(h)
+        hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
+
+        h = self.middle_block(h, emb, context=context, batch_size=b)
+        hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
+
+        hs_out = []
+        for module in self.output_blocks:
+            h = torch.cat([h, hs.pop()], dim=1)
+            h = module(h, emb, context=context, batch_size=b)
+            if isinstance(module[-1], Upsample):
+                hs_a.append(rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
+            hs_out.append(h)
+        hs_a.append(rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
+
+        y = self.out(h.type(h.dtype))
+        y = rearrange(y, '(b t) c h w -> b c t h w', b=b)
+        return (y, *hs_a)
+
+
+def export_backbone_onnx(model, save_path, context_len=95):
+    wma = model.model.diffusion_model
+    wrapper = VideoBackboneForExport(wma)
+    wrapper.eval().cuda()
+
+    for m in wrapper.modules():
+        if hasattr(m, 'checkpoint'):
+            m.checkpoint = False
+        if hasattr(m, 'use_checkpoint'):
+            m.use_checkpoint = False
+
+    import xformers.ops
+    _orig_mea = xformers.ops.memory_efficient_attention
+    def _sdpa_replacement(q, k, v, attn_bias=None, op=None, **kw):
+        return torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
+    xformers.ops.memory_efficient_attention = _sdpa_replacement
+
+    BT = 16
+    emb_dim = wma.model_channels * 4
+    ctx_dim = 1024
+    in_ch = wma.in_channels
+
+    dummy_h = torch.randn(BT, in_ch, 40, 64, device='cuda', dtype=torch.float32)
+    dummy_emb = torch.randn(BT, emb_dim, device='cuda', dtype=torch.float32)
+    dummy_ctx = torch.randn(BT, context_len, ctx_dim, device='cuda', dtype=torch.float32)
+
+    with torch.no_grad():
+        outputs = wrapper(dummy_h, dummy_emb, dummy_ctx)
+        n_outputs = len(outputs)
+        print(f">>> Backbone has {n_outputs} outputs (1 y + {n_outputs-1} hs_a)")
+        for i, o in enumerate(outputs):
+            print(f"    output[{i}]: {o.shape} {o.dtype}")
+
+    output_names = ['y'] + [f'hs_a_{i}' for i in range(n_outputs - 1)]
+
+    torch.onnx.export(
+        wrapper,
+        (dummy_h, dummy_emb, dummy_ctx),
+        save_path,
+        input_names=['h', 'emb', 'context'],
+        output_names=output_names,
+        opset_version=17,
+        do_constant_folding=True,
+    )
+    print(f">>> ONNX exported to {save_path}")
+    xformers.ops.memory_efficient_attention = _orig_mea
+    return n_outputs
+
+
+class TRTBackbone:
+    """TensorRT runtime wrapper for the video UNet backbone."""
+
+    def __init__(self, engine_path, n_hs_a=9):
+        import tensorrt as trt
+
+        self.logger = trt.Logger(trt.Logger.WARNING)
+        with open(engine_path, 'rb') as f:
+            runtime = trt.Runtime(self.logger)
+            self.engine = runtime.deserialize_cuda_engine(f.read())
+        self.context = self.engine.create_execution_context()
+        self.n_hs_a = n_hs_a
+
+        import numpy as np
+        self.output_buffers = {}
+        for i in range(self.engine.num_io_tensors):
+            name = self.engine.get_tensor_name(i)
+            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
+                shape = self.engine.get_tensor_shape(name)
+                np_dtype = trt.nptype(self.engine.get_tensor_dtype(name))
+                buf = torch.empty(list(shape), dtype=torch.from_numpy(np.empty(0, dtype=np_dtype)).dtype, device='cuda')
+                self.output_buffers[name] = buf
+                print(f"    TRT output '{name}': {list(shape)} {buf.dtype}")
+
+    def __call__(self, h, emb, context):
+        import tensorrt as trt
+        for name, tensor in [('h', h), ('emb', emb), ('context', context)]:
+            expected_dtype = trt.nptype(self.engine.get_tensor_dtype(name))
+            torch_expected = torch.from_numpy(__import__('numpy').empty(0, dtype=expected_dtype)).dtype
+            if tensor.dtype != torch_expected:
+                tensor = tensor.to(torch_expected)
+            self.context.set_tensor_address(name, tensor.contiguous().data_ptr())
+
+        for name, buf in self.output_buffers.items():
+            self.context.set_tensor_address(name, buf.data_ptr())
+
+        self.context.execute_async_v3(torch.cuda.current_stream().cuda_stream)
+        torch.cuda.synchronize()
+
+        y = self.output_buffers['y']
+        hs_a = [self.output_buffers[f'hs_a_{i}'] for i in range(self.n_hs_a)]
+        return y, hs_a
--- a/unitree_g1_pack_camera/case1/output.log
+++ b/unitree_g1_pack_camera/case1/output.log
@@ -0,0 +1,179 @@
+2026-02-18 19:01:56.891895: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-18 19:01:56.940243: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-18 19:01:56.940285: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-18 19:01:56.941395: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-18 19:01:56.948327: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2026-02-18 19:01:57.870809: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+Global seed set to 123
+>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
+>>> Prepared model loaded.
+INFO:root:***** Configing Data *****
+>>> unitree_z1_stackbox: 1 data samples loaded.
+>>> unitree_z1_stackbox: data stats loaded.
+>>> unitree_z1_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
+>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
+>>> unitree_g1_pack_camera: 1 data samples loaded.
+>>> unitree_g1_pack_camera: data stats loaded.
+>>> unitree_g1_pack_camera: normalizer initiated.
+>>> Dataset is successfully loaded ...
+    ✓ KV fused: 66 attention layers
+    TRT output 'y': [1, 4, 16, 40, 64] torch.float32
+    TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
+    TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
+>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
+>>> Generate 16 frames under each generation ...
+DEBUG:h5py._conv:Creating converter from 3 to 5
+DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
+DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
+DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
+
+  0%|          | 0/11 [00:00<?, ?it/s][02/18/2026-19:02:10] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
+
+  9%|▉         | 1/11 [00:17<02:51, 17.15s/it]>>> Step 0: generating actions ...
+>>> Step 0: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 1: generating actions ...
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
+DEBUG:PIL.Image:Importing PpmImagePlugin
+DEBUG:PIL.Image:Importing PsdImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
+DEBUG:PIL.Image:Importing PpmImagePlugin
+DEBUG:PIL.Image:Importing PsdImagePlugin
+DEBUG:PIL.Image:Importing QoiImagePlugin
+DEBUG:PIL.Image:Importing QoiImagePlugin
+DEBUG:PIL.Image:Importing SgiImagePlugin
+DEBUG:PIL.Image:Importing SgiImagePlugin
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
+DEBUG:PIL.Image:Importing TiffImagePlugin
+DEBUG:PIL.Image:Importing WebPImagePlugin
+DEBUG:PIL.Image:Importing TiffImagePlugin
+DEBUG:PIL.Image:Importing WebPImagePlugin
+DEBUG:PIL.Image:Importing WmfImagePlugin
+DEBUG:PIL.Image:Importing WmfImagePlugin
+DEBUG:PIL.Image:Importing XbmImagePlugin
+DEBUG:PIL.Image:Importing XbmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XVThumbImagePlugin
+DEBUG:PIL.Image:Importing XVThumbImagePlugin
+
+ 18%|█▊        | 2/11 [00:33<02:31, 16.87s/it]
+ 27%|██▋       | 3/11 [00:50<02:14, 16.76s/it]
+ 36%|███▋      | 4/11 [01:07<01:57, 16.81s/it]
+ 45%|████▌     | 5/11 [01:24<01:41, 16.85s/it]
+ 55%|█████▍    | 6/11 [01:41<01:24, 16.82s/it]
+ 64%|██████▎   | 7/11 [01:57<01:07, 16.82s/it]
+ 73%|███████▎  | 8/11 [02:14<00:50, 16.83s/it]
+ 82%|████████▏ | 9/11 [02:31<00:33, 16.80s/it]
+ 91%|█████████ | 10/11 [02:48<00:16, 16.81s/it]
+100%|██████████| 11/11 [03:05<00:00, 16.81s/it]
+100%|██████████| 11/11 [03:05<00:00, 16.83s/it]
+>>> Step 1: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 2: generating actions ...
+>>> Step 2: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 3: generating actions ...
+>>> Step 3: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 4: generating actions ...
+>>> Step 4: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 5: generating actions ...
+>>> Step 5: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 6: generating actions ...
+>>> Step 6: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 7: generating actions ...
+>>> Step 7: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_g1_pack_camera/case1/psnr_result.json
+++ b/unitree_g1_pack_camera/case1/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_g1_pack_camera/case1/unitree_g1_pack_camera_case1.mp4",
+    "pred_video": "unitree_g1_pack_camera/case1/output/inference/0_full_fs6.mp4",
+    "psnr": 35.615362167470806
+}
--- a/unitree_g1_pack_camera/case2/output.log
+++ b/unitree_g1_pack_camera/case2/output.log
@@ -0,0 +1,179 @@
+2026-02-18 19:05:45.956647: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-18 19:05:46.004149: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-18 19:05:46.004193: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-18 19:05:46.005265: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-18 19:05:46.012074: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2026-02-18 19:05:46.932966: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+Global seed set to 123
+>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
+>>> Prepared model loaded.
+INFO:root:***** Configing Data *****
+>>> unitree_z1_stackbox: 1 data samples loaded.
+>>> unitree_z1_stackbox: data stats loaded.
+>>> unitree_z1_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
+>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
+>>> unitree_g1_pack_camera: 1 data samples loaded.
+>>> unitree_g1_pack_camera: data stats loaded.
+>>> unitree_g1_pack_camera: normalizer initiated.
+>>> Dataset is successfully loaded ...
+    ✓ KV fused: 66 attention layers
+    TRT output 'y': [1, 4, 16, 40, 64] torch.float32
+    TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
+    TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
+>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
+>>> Generate 16 frames under each generation ...
+DEBUG:h5py._conv:Creating converter from 3 to 5
+DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
+DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
+DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
+
+  0%|          | 0/11 [00:00<?, ?it/s][02/18/2026-19:05:59] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
+
+  9%|▉         | 1/11 [00:16<02:47, 16.71s/it]>>> Step 0: generating actions ...
+>>> Step 0: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 1: generating actions ...
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
+DEBUG:PIL.Image:Importing PpmImagePlugin
+DEBUG:PIL.Image:Importing PsdImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
+DEBUG:PIL.Image:Importing PpmImagePlugin
+DEBUG:PIL.Image:Importing PsdImagePlugin
+DEBUG:PIL.Image:Importing QoiImagePlugin
+DEBUG:PIL.Image:Importing QoiImagePlugin
+DEBUG:PIL.Image:Importing SgiImagePlugin
+DEBUG:PIL.Image:Importing SgiImagePlugin
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
+DEBUG:PIL.Image:Importing TiffImagePlugin
+DEBUG:PIL.Image:Importing WebPImagePlugin
+DEBUG:PIL.Image:Importing TiffImagePlugin
+DEBUG:PIL.Image:Importing WebPImagePlugin
+DEBUG:PIL.Image:Importing WmfImagePlugin
+DEBUG:PIL.Image:Importing WmfImagePlugin
+DEBUG:PIL.Image:Importing XbmImagePlugin
+DEBUG:PIL.Image:Importing XbmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XVThumbImagePlugin
+DEBUG:PIL.Image:Importing XVThumbImagePlugin
+
+ 18%|█▊        | 2/11 [00:33<02:30, 16.75s/it]
+ 27%|██▋       | 3/11 [00:50<02:15, 16.91s/it]
+ 36%|███▋      | 4/11 [01:07<01:59, 17.02s/it]
+ 45%|████▌     | 5/11 [01:24<01:41, 16.98s/it]
+ 55%|█████▍    | 6/11 [01:41<01:24, 16.94s/it]
+ 64%|██████▎   | 7/11 [01:58<01:07, 16.90s/it]
+ 73%|███████▎  | 8/11 [02:15<00:50, 16.83s/it]
+ 82%|████████▏ | 9/11 [02:31<00:33, 16.80s/it]
+ 91%|█████████ | 10/11 [02:49<00:16, 16.94s/it]
+100%|██████████| 11/11 [03:06<00:00, 16.97s/it]
+100%|██████████| 11/11 [03:06<00:00, 16.91s/it]
+>>> Step 1: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 2: generating actions ...
+>>> Step 2: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 3: generating actions ...
+>>> Step 3: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 4: generating actions ...
+>>> Step 4: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 5: generating actions ...
+>>> Step 5: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 6: generating actions ...
+>>> Step 6: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 7: generating actions ...
+>>> Step 7: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_g1_pack_camera/case2/psnr_result.json
+++ b/unitree_g1_pack_camera/case2/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_g1_pack_camera/case2/unitree_g1_pack_camera_case2.mp4",
+    "pred_video": "unitree_g1_pack_camera/case2/output/inference/50_full_fs6.mp4",
+    "psnr": 34.61979248212279
+}
--- a/unitree_g1_pack_camera/case3/output.log
+++ b/unitree_g1_pack_camera/case3/output.log
@@ -0,0 +1,146 @@
+2026-02-18 19:09:35.113634: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-18 19:09:35.161428: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-18 19:09:35.161474: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-18 19:09:35.162551: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-18 19:09:35.169325: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2026-02-18 19:09:36.089250: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+Global seed set to 123
+>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
+>>> Prepared model loaded.
+INFO:root:***** Configing Data *****
+>>> unitree_z1_stackbox: 1 data samples loaded.
+>>> unitree_z1_stackbox: data stats loaded.
+>>> unitree_z1_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
+>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
+>>> unitree_g1_pack_camera: 1 data samples loaded.
+>>> unitree_g1_pack_camera: data stats loaded.
+>>> unitree_g1_pack_camera: normalizer initiated.
+>>> Dataset is successfully loaded ...
+    ✓ KV fused: 66 attention layers
+    TRT output 'y': [1, 4, 16, 40, 64] torch.float32
+    TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
+    TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
+>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
+>>> Generate 16 frames under each generation ...
+DEBUG:h5py._conv:Creating converter from 3 to 5
+DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
+DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
+DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
+
+  0%|          | 0/11 [00:00<?, ?it/s][02/18/2026-19:09:49] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
+
+  9%|▉         | 1/11 [00:16<02:45, 16.53s/it]>>> Step 0: generating actions ...
+>>> Step 0: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 1: generating actions ...
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
+DEBUG:PIL.Image:Importing PpmImagePlugin
+DEBUG:PIL.Image:Importing PsdImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
+DEBUG:PIL.Image:Importing PpmImagePlugin
+DEBUG:PIL.Image:Importing PsdImagePlugin
+DEBUG:PIL.Image:Importing QoiImagePlugin
+DEBUG:PIL.Image:Importing QoiImagePlugin
+DEBUG:PIL.Image:Importing SgiImagePlugin
+DEBUG:PIL.Image:Importing SgiImagePlugin
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
+DEBUG:PIL.Image:Importing TiffImagePlugin
+DEBUG:PIL.Image:Importing WebPImagePlugin
+DEBUG:PIL.Image:Importing TiffImagePlugin
+DEBUG:PIL.Image:Importing WebPImagePlugin
+DEBUG:PIL.Image:Importing WmfImagePlugin
+DEBUG:PIL.Image:Importing WmfImagePlugin
+DEBUG:PIL.Image:Importing XbmImagePlugin
+DEBUG:PIL.Image:Importing XbmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XVThumbImagePlugin
--- a/unitree_g1_pack_camera/case3/psnr_result.json
+++ b/unitree_g1_pack_camera/case3/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_g1_pack_camera/case3/unitree_g1_pack_camera_case3.mp4",
+    "pred_video": "unitree_g1_pack_camera/case3/output/inference/100_full_fs6.mp4",
+    "psnr": 37.034952654534486
+}
--- a/unitree_g1_pack_camera/case4/psnr_result.json
+++ b/unitree_g1_pack_camera/case4/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_g1_pack_camera/case4/unitree_g1_pack_camera_case4.mp4",
+    "pred_video": "unitree_g1_pack_camera/case4/output/inference/200_full_fs6.mp4",
+    "psnr": 31.43390896360405
+}
--- a/unitree_z1_dual_arm_cleanup_pencils/case2/psnr_result.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case2/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case2/unitree_z1_dual_arm_cleanup_pencils_case2.mp4",
+    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case2/output/inference/50_full_fs4.mp4",
+    "psnr": 48.344571927558974
+}
--- a/unitree_z1_dual_arm_cleanup_pencils/case3/psnr_result.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case3/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case3/unitree_z1_dual_arm_cleanup_pencils_case3.mp4",
+    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case3/output/inference/100_full_fs4.mp4",
+    "psnr": 41.152374490134825
+}
--- a/unitree_z1_dual_arm_cleanup_pencils/case4/psnr_result.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case4/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case4/unitree_z1_dual_arm_cleanup_pencils_case4.mp4",
+    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case4/output/inference/200_full_fs4.mp4",
+    "psnr": 46.025723557253855
+}
--- a/unitree_z1_dual_arm_stackbox/case1/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox/case1/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_dual_arm_stackbox/case1/unitree_z1_dual_arm_stackbox_case1.mp4",
+    "pred_video": "unitree_z1_dual_arm_stackbox/case1/output/inference/5_full_fs4.mp4",
+    "psnr": 44.3480149502738
+}
--- a/unitree_z1_dual_arm_stackbox/case2/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox/case2/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_dual_arm_stackbox/case2/unitree_z1_dual_arm_stackbox_case2.mp4",
+    "pred_video": "unitree_z1_dual_arm_stackbox/case2/output/inference/15_full_fs4.mp4",
+    "psnr": 39.867728254007716
+}
--- a/unitree_z1_dual_arm_stackbox/case3/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox/case3/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_dual_arm_stackbox/case3/unitree_z1_dual_arm_stackbox_case3.mp4",
+    "pred_video": "unitree_z1_dual_arm_stackbox/case3/output/inference/25_full_fs4.mp4",
+    "psnr": 39.19101039445159
+}
--- a/unitree_z1_dual_arm_stackbox/case4/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox/case4/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_dual_arm_stackbox/case4/unitree_z1_dual_arm_stackbox_case4.mp4",
+    "pred_video": "unitree_z1_dual_arm_stackbox/case4/output/inference/35_full_fs4.mp4",
+    "psnr": 40.29563315341769
+}
--- a/unitree_z1_dual_arm_stackbox_v2/case1/output.log
+++ b/unitree_z1_dual_arm_stackbox_v2/case1/output.log
@@ -1,10 +1,10 @@
-2026-02-11 11:59:27.241485: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
-2026-02-11 11:59:27.291755: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
-2026-02-11 11:59:27.291807: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
-2026-02-11 11:59:27.293169: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
-2026-02-11 11:59:27.300838: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+2026-02-18 18:49:49.117856: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-18 18:49:49.165270: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-18 18:49:49.165322: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-18 18:49:49.166382: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-18 18:49:49.173299: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
-2026-02-11 11:59:28.228009: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+2026-02-18 18:49:50.090214: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
 >>> Prepared model loaded.
@@ -26,12 +26,24 @@ INFO:root:***** Configing Data *****
 >>> unitree_g1_pack_camera: normalizer initiated.
 >>> Dataset is successfully loaded ...
    ✓ KV fused: 66 attention layers
+    TRT output 'y': [1, 4, 16, 40, 64] torch.float32
+    TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
+    TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
+>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
 >>> Generate 16 frames under each generation ...
 DEBUG:h5py._conv:Creating converter from 3 to 5
 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
-
+
+  0%|          | 0/11 [00:00<?, ?it/s][02/18/2026-18:50:03] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.

  9%|▉         | 1/11 [00:15<02:38, 15.88s/it]>>> Step 0: generating actions ...
 >>> Step 0: interacting with world model ...
@@ -84,7 +96,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing WebPImagePlugin
 DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
-DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin

 18%|█▊        | 2/11 [00:31<02:21, 15.71s/it]
@@ -115,6 +127,6 @@ DEBUG:PIL.Image:Importing XVThumbImagePlugin
 >>> Step 6: generating actions ...
 >>> Step 6: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
->>> Step 7: generating actions ...
->>> Step 7: interacting with world model ...
->>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 7: generating actions ...
+>>> Step 7: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_z1_dual_arm_stackbox_v2/case1/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox_v2/case1/psnr_result.json
@@ -1,5 +1,5 @@
 {
-    "gt_video": "/home/qhy/unifolm-world-model-action/unitree_z1_dual_arm_stackbox_v2/case1/unitree_z1_dual_arm_stackbox_v2_case1.mp4",
-    "pred_video": "/home/qhy/unifolm-world-model-action/unitree_z1_dual_arm_stackbox_v2/case1/output/inference/5_full_fs4.mp4",
-    "psnr": 28.167025381705358
+    "gt_video": "unitree_z1_dual_arm_stackbox_v2/case1/unitree_z1_dual_arm_stackbox_v2_case1.mp4",
+    "pred_video": "unitree_z1_dual_arm_stackbox_v2/case1/output/inference/5_full_fs4.mp4",
+    "psnr": 27.62636266067224
 }
--- a/unitree_z1_dual_arm_stackbox_v2/case2/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox_v2/case2/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_dual_arm_stackbox_v2/case2/unitree_z1_dual_arm_stackbox_v2_case2.mp4",
+    "pred_video": "unitree_z1_dual_arm_stackbox_v2/case2/output/inference/15_full_fs4.mp4",
+    "psnr": 33.90444714332389
+}
--- a/unitree_z1_dual_arm_stackbox_v2/case3/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox_v2/case3/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_dual_arm_stackbox_v2/case3/unitree_z1_dual_arm_stackbox_v2_case3.mp4",
+    "pred_video": "unitree_z1_dual_arm_stackbox_v2/case3/output/inference/25_full_fs4.mp4",
+    "psnr": 34.50192428908007
+}
--- a/unitree_z1_dual_arm_stackbox_v2/case4/output.log
+++ b/unitree_z1_dual_arm_stackbox_v2/case4/output.log
@@ -0,0 +1,179 @@
+2026-02-18 18:54:56.403136: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-18 18:54:56.451144: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-18 18:54:56.451189: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-18 18:54:56.452312: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-18 18:54:56.459281: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2026-02-18 18:54:57.381032: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+Global seed set to 123
+>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
+>>> Prepared model loaded.
+INFO:root:***** Configing Data *****
+>>> unitree_z1_stackbox: 1 data samples loaded.
+>>> unitree_z1_stackbox: data stats loaded.
+>>> unitree_z1_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
+>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
+>>> unitree_g1_pack_camera: 1 data samples loaded.
+>>> unitree_g1_pack_camera: data stats loaded.
+>>> unitree_g1_pack_camera: normalizer initiated.
+>>> Dataset is successfully loaded ...
+    ✓ KV fused: 66 attention layers
+    TRT output 'y': [1, 4, 16, 40, 64] torch.float32
+    TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
+    TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
+>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
+>>> Generate 16 frames under each generation ...
+DEBUG:h5py._conv:Creating converter from 3 to 5
+DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
+DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
+DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
+
+  0%|          | 0/11 [00:00<?, ?it/s][02/18/2026-18:55:10] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
+
+  9%|▉         | 1/11 [00:16<02:45, 16.53s/it]>>> Step 0: generating actions ...
+>>> Step 0: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 1: generating actions ...
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BlpImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing BmpImagePlugin
+DEBUG:PIL.Image:Importing BufrStubImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing CurImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DcxImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing DdsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing EpsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FitsStubImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FliImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Importing FpxImagePlugin
+DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing FtexImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GbrImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing GifImagePlugin
+DEBUG:PIL.Image:Importing GribStubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcnsImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing IcoImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing ImtImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing IptcImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing JpegImagePlugin
+DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
+DEBUG:PIL.Image:Importing McIdasImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Importing MicImagePlugin
+DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpegImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MpoImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing MspImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PalmImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcdImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PcxImagePlugin
+DEBUG:PIL.Image:Importing PdfImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PixarImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
+DEBUG:PIL.Image:Importing PpmImagePlugin
+DEBUG:PIL.Image:Importing PsdImagePlugin
+DEBUG:PIL.Image:Importing PngImagePlugin
+DEBUG:PIL.Image:Importing PpmImagePlugin
+DEBUG:PIL.Image:Importing PsdImagePlugin
+DEBUG:PIL.Image:Importing QoiImagePlugin
+DEBUG:PIL.Image:Importing QoiImagePlugin
+DEBUG:PIL.Image:Importing SgiImagePlugin
+DEBUG:PIL.Image:Importing SgiImagePlugin
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
+DEBUG:PIL.Image:Importing TiffImagePlugin
+DEBUG:PIL.Image:Importing WebPImagePlugin
+DEBUG:PIL.Image:Importing TiffImagePlugin
+DEBUG:PIL.Image:Importing WebPImagePlugin
+DEBUG:PIL.Image:Importing WmfImagePlugin
+DEBUG:PIL.Image:Importing WmfImagePlugin
+DEBUG:PIL.Image:Importing XbmImagePlugin
+DEBUG:PIL.Image:Importing XbmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XpmImagePlugin
+DEBUG:PIL.Image:Importing XVThumbImagePlugin
+DEBUG:PIL.Image:Importing XVThumbImagePlugin
+
+ 18%|█▊        | 2/11 [00:33<02:28, 16.52s/it]
+ 27%|██▋       | 3/11 [00:49<02:12, 16.53s/it]
+ 36%|███▋      | 4/11 [01:06<01:56, 16.64s/it]
+ 45%|████▌     | 5/11 [01:23<01:40, 16.69s/it]
+ 55%|█████▍    | 6/11 [01:39<01:23, 16.71s/it]
+ 64%|██████▎   | 7/11 [01:56<01:06, 16.68s/it]
+ 73%|███████▎  | 8/11 [02:13<00:50, 16.68s/it]
+ 82%|████████▏ | 9/11 [02:29<00:33, 16.71s/it]
+ 91%|█████████ | 10/11 [02:46<00:16, 16.72s/it]
+100%|██████████| 11/11 [03:03<00:00, 16.69s/it]
+100%|██████████| 11/11 [03:03<00:00, 16.67s/it]
+>>> Step 1: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 2: generating actions ...
+>>> Step 2: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 3: generating actions ...
+>>> Step 3: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 4: generating actions ...
+>>> Step 4: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 5: generating actions ...
+>>> Step 5: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 6: generating actions ...
+>>> Step 6: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 7: generating actions ...
+>>> Step 7: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
--- a/unitree_z1_dual_arm_stackbox_v2/case4/psnr_result.json
+++ b/unitree_z1_dual_arm_stackbox_v2/case4/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_dual_arm_stackbox_v2/case4/unitree_z1_dual_arm_stackbox_v2_case4.mp4",
+    "pred_video": "unitree_z1_dual_arm_stackbox_v2/case4/output/inference/35_full_fs4.mp4",
+    "psnr": 25.49270910031428
+}
--- a/unitree_z1_stackbox/case1/psnr_result.json
+++ b/unitree_z1_stackbox/case1/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_stackbox/case1/unitree_z1_stackbox_case1.mp4",
+    "pred_video": "unitree_z1_stackbox/case1/output/inference/5_full_fs4.mp4",
+    "psnr": 42.83913947323794
+}
--- a/unitree_z1_stackbox/case2/psnr_result.json
+++ b/unitree_z1_stackbox/case2/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_stackbox/case2/unitree_z1_stackbox_case2.mp4",
+    "pred_video": "unitree_z1_stackbox/case2/output/inference/15_full_fs4.mp4",
+    "psnr": 48.64571989587276
+}
--- a/unitree_z1_stackbox/case3/psnr_result.json
+++ b/unitree_z1_stackbox/case3/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_stackbox/case3/unitree_z1_stackbox_case3.mp4",
+    "pred_video": "unitree_z1_stackbox/case3/output/inference/25_full_fs4.mp4",
+    "psnr": 45.127553229898034
+}
--- a/unitree_z1_stackbox/case4/psnr_result.json
+++ b/unitree_z1_stackbox/case4/psnr_result.json
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_z1_stackbox/case4/unitree_z1_stackbox_case4.mp4",
+    "pred_video": "unitree_z1_stackbox/case4/output/inference/35_full_fs4.mp4",
+    "psnr": 50.642542240144444
+}