From 4288c9d8c9909d6c6bf856cc0b7625e0bd5abb66 Mon Sep 17 00:00:00 2001
From: olivame <dyz@olivame.xyz>
Date: Mon, 9 Feb 2026 16:48:16 +0000
Subject: [PATCH] =?UTF-8?q?=E5=87=8F=E5=B0=91=E4=BA=86=E4=B8=80=E8=B7=AF?=
 =?UTF-8?q?=E8=A7=86=E9=A2=91vae=E8=A7=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

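---
Reviewer notes (English; text between the three-dash line and the first
"diff --git" line is not recorded in the commit):

  * Subject, decoded: "Reduced video VAE decoding by one pass".
  * image_guided_synthesis_sim_mode gains `decode_video: bool = True`.
    When it is False, `model.decode_first_stage(samples)` is skipped and
    the first element of the returned tuple is None.
  * New CLI flag `--fast_policy_no_decode` (store_true, default False).
    run_inference passes `decode_video=not args.fast_policy_no_decode`
    on the `sim_mode=False` call and skips the "dm" TensorBoard log and
    the "dm" mp4 save when `pred_videos_0` is None.
  * case1/run_world_model_interaction.sh enables the flag; the committed
    output.log shows `real` going from 10m17.951s to 10m15.640s.
  * NOTE(review): in the visible hunk only `batch_variants` gets a default
    before `if ddim_sampler is not None:`; `actions` and `states` appear
    to be assigned only inside that branch. Confirm that the final
    `return` cannot hit unbound names when `ddim_sampler` is None.
  * NOTE(review): psnr_result1.json changes from 30.0585 to 31.8022 in
    this same commit. Confirm whether that change is expected from
    skipping the policy-pass decode or comes from run-to-run variation.
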
 scripts/evaluation/world_model_interaction.py | 44 ++++++++++++-------
 .../case1/output.log                          | 24 +++++-----
 .../case1/psnr_result1.json                   |  2 +-
 .../case1/run_world_model_interaction.sh      |  3 +-
 4 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/scripts/evaluation/world_model_interaction.py b/scripts/evaluation/world_model_interaction.py
index 60d7e8f..3270dda 100644
--- a/scripts/evaluation/world_model_interaction.py
+++ b/scripts/evaluation/world_model_interaction.py
@@ -444,7 +444,8 @@ def image_guided_synthesis_sim_mode(
         timestep_spacing: str = 'uniform',
         guidance_rescale: float = 0.0,
         sim_mode: bool = True,
-        **kwargs) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        decode_video: bool = True,
+        **kwargs) -> tuple[torch.Tensor | None, torch.Tensor, torch.Tensor]:
     """
     Performs image-guided video generation in a simulation-style mode with optional multimodal guidance (image, state, action, text).
 
@@ -467,10 +468,13 @@ def image_guided_synthesis_sim_mode(
         timestep_spacing (str): Timestep sampling method in DDIM sampler. Typically "uniform" or "linspace".
         guidance_rescale (float): Guidance rescaling factor to mitigate overexposure from classifier-free guidance.
         sim_mode (bool): Whether to perform world-model interaction or decision-making using the world-model.
+        decode_video (bool): Whether to decode latent samples to pixel-space video.
+            Set to False to skip VAE decode for speed when only actions/states are needed.
         **kwargs: Additional arguments passed to the DDIM sampler.
 
     Returns:
-        batch_variants (torch.Tensor): Predicted pixel-space video frames [B, C, T, H, W].
+        batch_variants (torch.Tensor | None): Predicted pixel-space video frames [B, C, T, H, W],
+            or None when decode_video=False.
         actions (torch.Tensor): Predicted action sequences [B, T, D] from diffusion decoding.
         states (torch.Tensor): Predicted state sequences [B, T, D] from diffusion decoding.
     """
@@ -554,6 +558,7 @@ def image_guided_synthesis_sim_mode(
     else:
         autocast_ctx = nullcontext()
 
+    batch_variants = None
     if ddim_sampler is not None:
         with autocast_ctx:
             samples, actions, states, intermedia = ddim_sampler.sample(
@@ -573,9 +578,10 @@ def image_guided_synthesis_sim_mode(
             guidance_rescale=guidance_rescale,
             **kwargs)
 
-        # Reconstruct from latent to pixel space
-        batch_images = model.decode_first_stage(samples)
-        batch_variants = batch_images
+        if decode_video:
+            # Reconstruct from latent to pixel space
+            batch_images = model.decode_first_stage(samples)
+            batch_variants = batch_images
 
     return batch_variants, actions, states
 
@@ -750,7 +756,8 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                     fs=model_input_fs,
                     timestep_spacing=args.timestep_spacing,
                     guidance_rescale=args.guidance_rescale,
-                    sim_mode=False)
+                    sim_mode=False,
+                    decode_video=not args.fast_policy_no_decode)
 
                 # Update future actions in the observation queues
                 for idx in range(len(pred_actions[0])):
@@ -808,11 +815,12 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                                                       observation)
 
                 # Save the imagen videos for decision-making
-                sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}"
-                log_to_tensorboard(writer,
-                                   pred_videos_0,
-                                   sample_tag,
-                                   fps=args.save_fps)
+                if pred_videos_0 is not None:
+                    sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}"
+                    log_to_tensorboard(writer,
+                                       pred_videos_0,
+                                       sample_tag,
+                                       fps=args.save_fps)
                 # Save videos environment changes via world-model interaction
                 sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/itr-{itr}"
                 log_to_tensorboard(writer,
@@ -821,10 +829,11 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                                    fps=args.save_fps)
 
                 # Save the imagen videos for decision-making
-                sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4'
-                save_results(pred_videos_0.cpu(),
-                             sample_video_file,
-                             fps=args.save_fps)
+                if pred_videos_0 is not None:
+                    sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4'
+                    save_results(pred_videos_0.cpu(),
+                                 sample_video_file,
+                                 fps=args.save_fps)
                 # Save videos environment changes via world-model interaction
                 sample_video_file = f'{video_save_dir}/wm/{fs}/itr-{itr}.mp4'
                 save_results(pred_videos_1.cpu(),
@@ -957,6 +966,11 @@ def get_parser():
                         action='store_true',
                         default=False,
                         help="not using the predicted states as comparison")
+    parser.add_argument(
+        "--fast_policy_no_decode",
+        action='store_true',
+        default=False,
+        help="Speed mode: policy pass only predicts actions, skip policy video decode/log/save.")
     parser.add_argument("--save_fps",
                         type=int,
                         default=8,
diff --git a/unitree_z1_dual_arm_cleanup_pencils/case1/output.log b/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
index 2bb97d7..5116dc6 100644
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
@@ -1,14 +1,14 @@
 /mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/lightning_fabric/__init__.py:29: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
   __import__("pkg_resources").declare_namespace(__name__)
-2026-02-08 18:43:46.463492: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
-2026-02-08 18:43:46.466714: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
-2026-02-08 18:43:46.498994: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
-2026-02-08 18:43:46.499029: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
-2026-02-08 18:43:46.500865: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
-2026-02-08 18:43:46.509069: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
-2026-02-08 18:43:46.509359: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+2026-02-09 16:37:01.511249: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-09 16:37:01.514371: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
+2026-02-09 16:37:01.545068: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-09 16:37:01.545097: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-09 16:37:01.546937: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-09 16:37:01.555024: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
+2026-02-09 16:37:01.555338: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
-2026-02-08 18:43:47.434136: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+2026-02-09 16:37:02.212554: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 [rank: 0] Global seed set to 123
 /mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/kornia/feature/lightglue.py:44: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
   @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
@@ -116,7 +116,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
- 12%|█▎        | 1/8 [01:12<08:27, 72.57s/it] 25%|██▌       | 2/8 [02:21<07:02, 70.44s/it] 38%|███▊      | 3/8 [03:30<05:48, 69.76s/it] 50%|█████     | 4/8 [04:39<04:37, 69.48s/it] 62%|██████▎   | 5/8 [05:48<03:27, 69.31s/it] 75%|███████▌  | 6/8 [06:57<02:18, 69.19s/it] 88%|████████▊ | 7/8 [08:06<01:09, 69.04s/it]100%|██████████| 8/8 [09:15<00:00, 69.05s/it]100%|██████████| 8/8 [09:15<00:00, 69.41s/it]
+ 12%|█▎        | 1/8 [01:11<08:20, 71.56s/it] 25%|██▌       | 2/8 [02:19<06:56, 69.36s/it] 38%|███▊      | 3/8 [03:27<05:43, 68.67s/it] 50%|█████     | 4/8 [04:35<04:33, 68.41s/it] 62%|██████▎   | 5/8 [05:43<03:25, 68.38s/it] 75%|███████▌  | 6/8 [06:51<02:16, 68.18s/it] 88%|████████▊ | 7/8 [07:59<01:08, 68.01s/it]100%|██████████| 8/8 [09:07<00:00, 68.02s/it]100%|██████████| 8/8 [09:07<00:00, 68.38s/it]
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
@@ -140,6 +140,6 @@ DEBUG:PIL.Image:Importing XVThumbImagePlugin
 >>> Step 7: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 
-real	10m17.951s
-user	11m44.955s
-sys	0m40.480s
+real	10m15.640s
+user	11m34.152s
+sys	0m48.021s
diff --git a/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json b/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json
index e9d3dba..5d699db 100644
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json
@@ -1,5 +1,5 @@
 {
     "gt_video": "/mnt/ASC1637/unifolm-world-model-action/unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/unitree_z1_dual_arm_cleanup_pencils_case1_amd.mp4",
     "pred_video": "/mnt/ASC1637/unifolm-world-model-action/unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/0_full_fs4.mp4",
-    "psnr": 30.058508734449845
+    "psnr": 31.802224855380352
 }
\ No newline at end of file
diff --git a/unitree_z1_dual_arm_cleanup_pencils/case1/run_world_model_interaction.sh b/unitree_z1_dual_arm_cleanup_pencils/case1/run_world_model_interaction.sh
index a7ad4bf..698fae2 100644
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/run_world_model_interaction.sh
@@ -21,5 +21,6 @@ dataset="unitree_z1_dual_arm_cleanup_pencils"
         --timestep_spacing 'uniform_trailing' \
         --guidance_rescale 0.7 \
         --perframe_ae \
-        --vae_dtype bf16
+        --vae_dtype bf16 \
+        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"