From 4288c9d8c9909d6c6bf856cc0b7625e0bd5abb66 Mon Sep 17 00:00:00 2001 From: olivame Date: Mon, 9 Feb 2026 16:48:16 +0000 Subject: [PATCH] =?UTF-8?q?=E5=87=8F=E5=B0=91=E4=BA=86=E4=B8=80=E8=B7=AF?= =?UTF-8?q?=E8=A7=86=E9=A2=91vae=E8=A7=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/evaluation/world_model_interaction.py | 44 ++++++++++++------- .../case1/output.log | 24 +++++----- .../case1/psnr_result1.json | 2 +- .../case1/run_world_model_interaction.sh | 3 +- 4 files changed, 44 insertions(+), 29 deletions(-) diff --git a/scripts/evaluation/world_model_interaction.py b/scripts/evaluation/world_model_interaction.py index 60d7e8f..3270dda 100644 --- a/scripts/evaluation/world_model_interaction.py +++ b/scripts/evaluation/world_model_interaction.py @@ -444,7 +444,8 @@ def image_guided_synthesis_sim_mode( timestep_spacing: str = 'uniform', guidance_rescale: float = 0.0, sim_mode: bool = True, - **kwargs) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + decode_video: bool = True, + **kwargs) -> tuple[torch.Tensor | None, torch.Tensor, torch.Tensor]: """ Performs image-guided video generation in a simulation-style mode with optional multimodal guidance (image, state, action, text). @@ -467,10 +468,13 @@ def image_guided_synthesis_sim_mode( timestep_spacing (str): Timestep sampling method in DDIM sampler. Typically "uniform" or "linspace". guidance_rescale (float): Guidance rescaling factor to mitigate overexposure from classifier-free guidance. sim_mode (bool): Whether to perform world-model interaction or decision-making using the world-model. + decode_video (bool): Whether to decode latent samples to pixel-space video. + Set to False to skip VAE decode for speed when only actions/states are needed. **kwargs: Additional arguments passed to the DDIM sampler. Returns: - batch_variants (torch.Tensor): Predicted pixel-space video frames [B, C, T, H, W]. + batch_variants (torch.Tensor | None): Predicted pixel-space video frames [B, C, T, H, W], + or None when decode_video=False. actions (torch.Tensor): Predicted action sequences [B, T, D] from diffusion decoding. states (torch.Tensor): Predicted state sequences [B, T, D] from diffusion decoding. """ @@ -554,6 +558,7 @@ def image_guided_synthesis_sim_mode( else: autocast_ctx = nullcontext() + batch_variants = None if ddim_sampler is not None: with autocast_ctx: samples, actions, states, intermedia = ddim_sampler.sample( @@ -573,9 +578,10 @@ def image_guided_synthesis_sim_mode( guidance_rescale=guidance_rescale, **kwargs) - # Reconstruct from latent to pixel space - batch_images = model.decode_first_stage(samples) - batch_variants = batch_images + if decode_video: + # Reconstruct from latent to pixel space + batch_images = model.decode_first_stage(samples) + batch_variants = batch_images return batch_variants, actions, states @@ -750,7 +756,8 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None: fs=model_input_fs, timestep_spacing=args.timestep_spacing, guidance_rescale=args.guidance_rescale, - sim_mode=False) + sim_mode=False, + decode_video=not args.fast_policy_no_decode) # Update future actions in the observation queues for idx in range(len(pred_actions[0])): @@ -808,11 +815,12 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None: observation) # Save the imagen videos for decision-making - sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}" - log_to_tensorboard(writer, - pred_videos_0, - sample_tag, - fps=args.save_fps) + if pred_videos_0 is not None: + sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}" + log_to_tensorboard(writer, + pred_videos_0, + sample_tag, + fps=args.save_fps) # Save videos environment changes via world-model interaction sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/itr-{itr}" log_to_tensorboard(writer, @@ -821,10 +829,11 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None: fps=args.save_fps) # Save the imagen videos for decision-making - sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4' - save_results(pred_videos_0.cpu(), - sample_video_file, - fps=args.save_fps) + if pred_videos_0 is not None: + sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4' + save_results(pred_videos_0.cpu(), + sample_video_file, + fps=args.save_fps) # Save videos environment changes via world-model interaction sample_video_file = f'{video_save_dir}/wm/{fs}/itr-{itr}.mp4' save_results(pred_videos_1.cpu(), @@ -957,6 +966,11 @@ def get_parser(): action='store_true', default=False, help="not using the predicted states as comparison") + parser.add_argument( + "--fast_policy_no_decode", + action='store_true', + default=False, + help="Speed mode: policy pass only predicts actions, skip policy video decode/log/save.") parser.add_argument("--save_fps", type=int, default=8, diff --git a/unitree_z1_dual_arm_cleanup_pencils/case1/output.log b/unitree_z1_dual_arm_cleanup_pencils/case1/output.log index 2bb97d7..5116dc6 100644 --- a/unitree_z1_dual_arm_cleanup_pencils/case1/output.log +++ b/unitree_z1_dual_arm_cleanup_pencils/case1/output.log @@ -1,14 +1,14 @@ /mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/lightning_fabric/__init__.py:29: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. __import__("pkg_resources").declare_namespace(__name__) -2026-02-08 18:43:46.463492: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. -2026-02-08 18:43:46.466714: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used. -2026-02-08 18:43:46.498994: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered -2026-02-08 18:43:46.499029: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered -2026-02-08 18:43:46.500865: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered -2026-02-08 18:43:46.509069: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used. -2026-02-08 18:43:46.509359: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. +2026-02-09 16:37:01.511249: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. +2026-02-09 16:37:01.514371: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used. +2026-02-09 16:37:01.545068: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered +2026-02-09 16:37:01.545097: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered +2026-02-09 16:37:01.546937: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered +2026-02-09 16:37:01.555024: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used. +2026-02-09 16:37:01.555338: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. -2026-02-08 18:43:47.434136: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT +2026-02-09 16:37:02.212554: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT [rank: 0] Global seed set to 123 /mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/kornia/feature/lightglue.py:44: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) @@ -116,7 +116,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin DEBUG:PIL.Image:Importing XbmImagePlugin DEBUG:PIL.Image:Importing XpmImagePlugin DEBUG:PIL.Image:Importing XVThumbImagePlugin - 12%|█▎ | 1/8 [01:12<08:27, 72.57s/it] 25%|██▌ | 2/8 [02:21<07:02, 70.44s/it] 38%|███▊ | 3/8 [03:30<05:48, 69.76s/it] 50%|█████ | 4/8 [04:39<04:37, 69.48s/it] 62%|██████▎ | 5/8 [05:48<03:27, 69.31s/it] 75%|███████▌ | 6/8 [06:57<02:18, 69.19s/it] 88%|████████▊ | 7/8 [08:06<01:09, 69.04s/it] 100%|██████████| 8/8 [09:15<00:00, 69.05s/it] 100%|██████████| 8/8 [09:15<00:00, 69.41s/it] + 12%|█▎ | 1/8 [01:11<08:20, 71.56s/it] 25%|██▌ | 2/8 [02:19<06:56, 69.36s/it] 38%|███▊ | 3/8 [03:27<05:43, 68.67s/it] 50%|█████ | 4/8 [04:35<04:33, 68.41s/it] 62%|██████▎ | 5/8 [05:43<03:25, 68.38s/it] 75%|███████▌ | 6/8 [06:51<02:16, 68.18s/it] 88%|████████▊ | 7/8 [07:59<01:08, 68.01s/it] 100%|██████████| 8/8 [09:07<00:00, 68.02s/it] 100%|██████████| 8/8 [09:07<00:00, 68.38s/it] >>>>>>>>>>>>>>>>>>>>>>>> >>> Step 1: generating actions ... >>> Step 1: interacting with world model ... @@ -140,6 +140,6 @@ DEBUG:PIL.Image:Importing XVThumbImagePlugin >>> Step 7: interacting with world model ... >>>>>>>>>>>>>>>>>>>>>>>> -real 10m17.951s -user 11m44.955s -sys 0m40.480s +real 10m15.640s +user 11m34.152s +sys 0m48.021s diff --git a/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json b/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json index e9d3dba..5d699db 100644 --- a/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json +++ b/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json @@ -1,5 +1,5 @@ { "gt_video": "/mnt/ASC1637/unifolm-world-model-action/unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/unitree_z1_dual_arm_cleanup_pencils_case1_amd.mp4", "pred_video": "/mnt/ASC1637/unifolm-world-model-action/unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/0_full_fs4.mp4", - "psnr": 30.058508734449845 + "psnr": 31.802224855380352 } \ No newline at end of file diff --git a/unitree_z1_dual_arm_cleanup_pencils/case1/run_world_model_interaction.sh b/unitree_z1_dual_arm_cleanup_pencils/case1/run_world_model_interaction.sh index a7ad4bf..698fae2 100644 --- a/unitree_z1_dual_arm_cleanup_pencils/case1/run_world_model_interaction.sh +++ b/unitree_z1_dual_arm_cleanup_pencils/case1/run_world_model_interaction.sh @@ -21,5 +21,6 @@ dataset="unitree_z1_dual_arm_cleanup_pencils" --timestep_spacing 'uniform_trailing' \ --guidance_rescale 0.7 \ --perframe_ae \ - --vae_dtype bf16 + --vae_dtype bf16 \ + --fast_policy_no_decode } 2>&1 | tee "${res_dir}/output.log"