diff --git a/scripts/evaluation/world_model_interaction.py b/scripts/evaluation/world_model_interaction.py index e2ba396..51bd803 100644 --- a/scripts/evaluation/world_model_interaction.py +++ b/scripts/evaluation/world_model_interaction.py @@ -572,9 +572,9 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None: print(f">>> Prepared model saved ({os.path.getsize(prepared_path) / 1024**3:.1f} GB).") # ---- BF16: only convert the diffusion backbone, keep VAE/CLIP/embedder in FP32 ---- - model.model.to(torch.bfloat16) - model.model.diffusion_model.dtype = torch.bfloat16 - print(">>> Diffusion backbone (model.model) converted to BF16.") + model.model.to(torch.float16) + model.model.diffusion_model.dtype = torch.float16 + print(">>> Diffusion backbone (model.model) converted to FP16.") # Build normalizer (always needed, independent of model loading path) logging.info("***** Configing Data *****") diff --git a/src/unifolm_wma/models/ddpms.py b/src/unifolm_wma/models/ddpms.py index fbf2042..7e596ac 100644 --- a/src/unifolm_wma/models/ddpms.py +++ b/src/unifolm_wma/models/ddpms.py @@ -2457,7 +2457,17 @@ class DiffusionWrapper(pl.LightningModule): Returns: Output from the inner diffusion model (tensor or tuple, depending on the model). """ + with torch.cuda.amp.autocast(dtype=torch.float16): + return self._forward_impl(x, x_action, x_state, t, + c_concat, c_crossattn, c_crossattn_action, + c_adm, s, mask, **kwargs) + def _forward_impl( + self, + x, x_action, x_state, t, + c_concat=None, c_crossattn=None, c_crossattn_action=None, + c_adm=None, s=None, mask=None, **kwargs, + ): if self.conditioning_key is None: out = self.diffusion_model(x, t) elif self.conditioning_key == 'concat': diff --git a/unitree_z1_dual_arm_stackbox_v2/case1/output.log b/unitree_z1_dual_arm_stackbox_v2/case1/output.log index b666f5a..1b8beb8 100644 --- a/unitree_z1_dual_arm_stackbox_v2/case1/output.log +++ b/unitree_z1_dual_arm_stackbox_v2/case1/output.log @@ -1,13 +1,14 @@ -2026-02-11 15:03:49.644187: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. -2026-02-11 15:03:49.694117: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered -2026-02-11 15:03:49.694162: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered -2026-02-11 15:03:49.695456: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered -2026-02-11 15:03:49.702946: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. +2026-02-11 16:14:08.942290: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. +2026-02-11 16:14:08.992267: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered +2026-02-11 16:14:08.992319: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered +2026-02-11 16:14:08.993621: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered +2026-02-11 16:14:09.001096: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. -2026-02-11 15:03:50.638334: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT +2026-02-11 16:14:09.927986: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT Global seed set to 123 >>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ... >>> Prepared model loaded. +>>> Diffusion backbone (model.model) converted to FP16. INFO:root:***** Configing Data ***** >>> unitree_z1_stackbox: 1 data samples loaded. >>> unitree_z1_stackbox: data stats loaded. @@ -31,7 +32,7 @@ DEBUG:h5py._conv:Creating converter from 3 to 5 DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13 DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096 - 0%| | 0/11 [00:00>> Step 0: generating actions ... + 0%| | 0/11 [00:00>> Step 0: generating actions ... >>> Step 0: interacting with world model ... >>>>>>>>>>>>>>>>>>>>>>>> >>> Step 1: generating actions ... @@ -84,7 +85,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin DEBUG:PIL.Image:Importing XbmImagePlugin DEBUG:PIL.Image:Importing XpmImagePlugin DEBUG:PIL.Image:Importing XVThumbImagePlugin - 18%|█▊ | 2/11 [01:08<05:08, 34.26s/it] 27%|██▋ | 3/11 [01:42<04:33, 34.17s/it] 36%|███▋ | 4/11 [02:16<03:59, 34.15s/it] 45%|████▌ | 5/11 [02:50<03:24, 34.11s/it] 55%|█████▍ | 6/11 [03:24<02:50, 34.09s/it] 64%|██████▎ | 7/11 [03:58<02:16, 34.07s/it] 73%|███████▎ | 8/11 [04:32<01:42, 34.04s/it] 82%|████████▏ | 9/11 [05:06<01:08, 34.05s/it] 91%|█████████ | 10/11 [05:40<00:34, 34.07s/it] 100%|██████████| 11/11 [06:15<00:00, 34.07s/it] 100%|██████████| 11/11 [06:15<00:00, 34.09s/it] + 18%|█▊ | 2/11 [00:47<03:32, 23.66s/it] 27%|██▋ | 3/11 [01:11<03:09, 23.66s/it] 36%|███▋ | 4/11 [01:34<02:45, 23.68s/it] 45%|████▌ | 5/11 [01:58<02:21, 23.65s/it] 55%|█████▍ | 6/11 [02:21<01:58, 23.63s/it] 64%|██████▎ | 7/11 [02:45<01:34, 23.61s/it] 73%|███████▎ | 8/11 [03:09<01:10, 23.61s/it] 82%|████████▏ | 9/11 [03:32<00:47, 23.60s/it] 91%|█████████ | 10/11 [03:56<00:23, 23.59s/it] 100%|██████████| 11/11 [04:19<00:00, 23.59s/it] 100%|██████████| 11/11 [04:19<00:00, 23.62s/it] >>> Step 1: interacting with world model ... >>>>>>>>>>>>>>>>>>>>>>>> >>> Step 2: generating actions ... @@ -115,6 +116,6 @@ DEBUG:PIL.Image:Importing XVThumbImagePlugin >>> Step 10: interacting with world model ... >>>>>>>>>>>>>>>>>>>>>>>> -real 6m56.031s -user 6m22.485s -sys 1m23.599s +real 5m0.723s +user 5m6.734s +sys 0m43.469s diff --git a/unitree_z1_dual_arm_stackbox_v2/case1/psnr_result.json b/unitree_z1_dual_arm_stackbox_v2/case1/psnr_result.json index 96bca2f..05b5cf9 100644 --- a/unitree_z1_dual_arm_stackbox_v2/case1/psnr_result.json +++ b/unitree_z1_dual_arm_stackbox_v2/case1/psnr_result.json @@ -1,5 +1,5 @@ { "gt_video": "unitree_z1_dual_arm_stackbox_v2/case1/unitree_z1_dual_arm_stackbox_v2_case1.mp4", "pred_video": "unitree_z1_dual_arm_stackbox_v2/case1/output/inference/5_full_fs4.mp4", - "psnr": 28.167025381705358 + "psnr": 25.21894470816415 } \ No newline at end of file