2 Commits

Author SHA1 Message Date
a09d35ae5b - state_unet 放到一个独立的 CUDA stream 上执行
- action_unet 在默认 stream 上同时执行
  - 用 wait_stream 确保两者都完成后再返回
两个 1D UNet 输入完全独立,共享的 hs_a 和 context_action 都是只读的。GPU 利用率只有 ~31%,小张量 kernel 不会打满 GPU,两个 stream 可以真正并行。
2026-02-10 10:47:10 +00:00
db848bca01 profile 结果 2026-02-10 07:02:20 +00:00
13 changed files with 132 additions and 198 deletions

1
.gitignore vendored
View File

@@ -130,3 +130,4 @@ Experiment/log
*.ckpt *.ckpt
*.0 *.0
unitree_z1_dual_arm_cleanup_pencils/case1/profile_output/traces/wx-ms-w7900d-0032_742306.1770698186047591119.pt.trace.json

View File

@@ -222,7 +222,7 @@ data:
test: test:
target: unifolm_wma.data.wma_data.WMAData target: unifolm_wma.data.wma_data.WMAData
params: params:
data_dir: '/home/qhy/unifolm-world-model-action/examples/world_model_interaction_prompts' data_dir: '/mnt/ASC1637/unifolm-world-model-action/examples/world_model_interaction_prompts'
video_length: ${model.params.wma_config.params.temporal_length} video_length: ${model.params.wma_config.params.temporal_length}
frame_stride: 2 frame_stride: 2
load_raw_resolution: True load_raw_resolution: True

View File

@@ -1,7 +1,5 @@
import argparse, os, glob import argparse, os, glob
from contextlib import nullcontext from contextlib import nullcontext
import atexit
from concurrent.futures import ThreadPoolExecutor
import pandas as pd import pandas as pd
import random import random
import torch import torch
@@ -13,15 +11,13 @@ import einops
import warnings import warnings
import imageio import imageio
from typing import Optional, List, Any
from pytorch_lightning import seed_everything from pytorch_lightning import seed_everything
from omegaconf import OmegaConf from omegaconf import OmegaConf
from tqdm import tqdm from tqdm import tqdm
from einops import rearrange, repeat from einops import rearrange, repeat
from collections import OrderedDict from collections import OrderedDict
from torch import nn from torch import nn
from eval_utils import populate_queues from eval_utils import populate_queues, log_to_tensorboard
from collections import deque from collections import deque
from torch import Tensor from torch import Tensor
from torch.utils.tensorboard import SummaryWriter from torch.utils.tensorboard import SummaryWriter
@@ -32,80 +28,6 @@ from unifolm_wma.utils.utils import instantiate_from_config
import torch.nn.functional as F import torch.nn.functional as F
# ========== Async I/O utilities ==========
# Shared thread pool used for background I/O. It stays None until
# _get_io_executor() builds it on first use.
_io_executor: Optional[ThreadPoolExecutor] = None
# Futures of the jobs submitted so far. _flush_io() waits on each of them and
# then empties this list.
_io_futures: List[Any] = []
def _get_io_executor() -> ThreadPoolExecutor:
    """Return the shared background I/O thread pool, creating it on first use.

    The pool is kept in the module-level ``_io_executor`` and is built with
    two worker threads the first time this function is called.

    Returns:
        ThreadPoolExecutor: The shared executor instance.
    """
    global _io_executor
    # Reuse the existing pool when one has already been created.
    if _io_executor is not None:
        return _io_executor
    _io_executor = ThreadPoolExecutor(max_workers=2)
    return _io_executor
def _flush_io():
    """Block until every queued background I/O job has finished.

    A job that raised is reported on stdout and otherwise ignored, so one
    failed job does not prevent the remaining ones from being awaited. The
    list of pending futures is emptied afterwards.
    """
    pending = _io_futures
    for job in pending:
        try:
            job.result()
        except Exception as e:
            # Best effort: report the failure and keep draining the queue.
            print(f">>> [async I/O] error: {e}")
    pending.clear()
# Wait for outstanding background I/O jobs when the interpreter exits.
atexit.register(_flush_io)
def _save_results_sync(video_cpu: Tensor, filename: str, fps: int) -> None:
    """Write a video tensor to ``filename`` as an h264 file (blocking).

    Values are clamped to [-1, 1], rescaled to [0, 255] and cast to uint8
    before being handed to ``torchvision.io.write_video``.

    Args:
        video_cpu (Tensor): 5-D video tensor. Axis 0 sets the number of grid
            columns and axis 2 is iterated to produce one grid per step
            (presumably batch and time respectively; confirm with callers).
        filename (str): Destination path of the video file.
        fps (int): Frame rate of the written video.
    """
    clipped = video_cpu.float().clamp(-1., 1.)
    columns = int(clipped.shape[0])
    # Bring axis 2 to the front so that iterating yields one sheet per step.
    per_step = clipped.permute(2, 0, 1, 3, 4)
    sheets = []
    for framesheet in per_step:
        sheets.append(
            torchvision.utils.make_grid(framesheet, nrow=columns, padding=0))
    frames = torch.stack(sheets, dim=0)
    # Map [-1, 1] to [0, 1], then to 8-bit values with channels last.
    frames = (frames + 1.0) / 2.0
    frames = (frames * 255).to(torch.uint8).permute(0, 2, 3, 1)
    torchvision.io.write_video(filename,
                               frames,
                               fps=fps,
                               video_codec='h264',
                               options={'crf': '10'})
def save_results_async(video: Tensor, filename: str, fps: int = 8) -> None:
    """Queue a video save on the background I/O thread pool.

    The tensor is detached and copied to the CPU in the calling thread; the
    encoding and file write then run in ``_save_results_sync`` on a worker.
    The resulting future is recorded in ``_io_futures``.

    Args:
        video (Tensor): Video tensor to save.
        filename (str): Destination path of the video file.
        fps (int): Frame rate of the written video.
    """
    host_copy = video.detach().cpu()
    pool = _get_io_executor()
    job = pool.submit(_save_results_sync, host_copy, filename, fps)
    _io_futures.append(job)
def _log_to_tb_sync(video_cpu: Tensor, writer: SummaryWriter, tag: str, fps: int) -> None:
    """Log a video tensor to TensorBoard under ``tag`` (blocking).

    Values are shifted from [-1, 1] to [0, 1] without clamping, and a leading
    axis of size one is added before calling ``writer.add_video``.

    Args:
        video_cpu (Tensor): 5-D video tensor. Axis 0 sets the number of grid
            columns and axis 2 is iterated to produce one grid per step
            (presumably batch and time respectively; confirm with callers).
        writer (SummaryWriter): TensorBoard writer receiving the video.
        tag (str): Tag under which the video is logged.
        fps (int): Frame rate passed to the writer.
    """
    as_float = video_cpu.float()
    columns = int(as_float.shape[0])
    # Bring axis 2 to the front so that iterating yields one sheet per step.
    per_step = as_float.permute(2, 0, 1, 3, 4)
    sheets = []
    for framesheet in per_step:
        sheets.append(
            torchvision.utils.make_grid(framesheet, nrow=columns, padding=0))
    frames = torch.stack(sheets, dim=0)
    frames = (frames + 1.0) / 2.0
    frames = frames.unsqueeze(dim=0)
    writer.add_video(tag, frames, fps=fps)
def log_to_tensorboard_async(writer: SummaryWriter, video: Tensor, tag: str, fps: int = 10) -> None:
    """Queue a TensorBoard video log on the background I/O thread pool.

    The tensor is detached and copied to the CPU in the calling thread; grid
    construction and the writer call then run in ``_log_to_tb_sync`` on a
    worker. The resulting future is recorded in ``_io_futures``.

    Args:
        writer (SummaryWriter): TensorBoard writer receiving the video.
        video (Tensor): Video tensor to log.
        tag (str): Tag under which the video is logged.
        fps (int): Frame rate passed to the writer.
    """
    host_copy = video.detach().cpu()
    pool = _get_io_executor()
    job = pool.submit(_log_to_tb_sync, host_copy, writer, tag, fps)
    _io_futures.append(job)
def patch_norm_bypass_autocast(): def patch_norm_bypass_autocast():
"""Monkey-patch GroupNorm and LayerNorm to bypass autocast's fp32 policy. """Monkey-patch GroupNorm and LayerNorm to bypass autocast's fp32 policy.
This eliminates bf16->fp32->bf16 dtype conversions during UNet forward.""" This eliminates bf16->fp32->bf16 dtype conversions during UNet forward."""
@@ -263,18 +185,17 @@ def get_filelist(data_dir: str, postfixes: list[str]) -> list[str]:
return file_list return file_list
def load_model_checkpoint(model: nn.Module, ckpt: str, device: str = "cpu") -> nn.Module: def load_model_checkpoint(model: nn.Module, ckpt: str) -> nn.Module:
"""Load model weights from checkpoint file. """Load model weights from checkpoint file.
Args: Args:
model (nn.Module): Model instance. model (nn.Module): Model instance.
ckpt (str): Path to the checkpoint file. ckpt (str): Path to the checkpoint file.
device (str): Target device for loaded tensors.
Returns: Returns:
nn.Module: Model with loaded weights. nn.Module: Model with loaded weights.
""" """
state_dict = torch.load(ckpt, map_location=device) state_dict = torch.load(ckpt, map_location="cpu")
if "state_dict" in list(state_dict.keys()): if "state_dict" in list(state_dict.keys()):
state_dict = state_dict["state_dict"] state_dict = state_dict["state_dict"]
try: try:
@@ -689,40 +610,21 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
# Load config # Load config
config = OmegaConf.load(args.config) config = OmegaConf.load(args.config)
prepared_path = args.ckpt_path + ".prepared.pt"
if os.path.exists(prepared_path):
# ---- Fast path: load the fully-prepared model ----
print(f">>> Loading prepared model from {prepared_path} ...")
model = torch.load(prepared_path,
map_location=f"cuda:{gpu_no}",
weights_only=False)
model.eval()
# Restore autocast attributes (weights already cast, just need contexts)
model.diffusion_autocast_dtype = torch.bfloat16 if args.diffusion_dtype == "bf16" else torch.bfloat16
model.projector_autocast_dtype = torch.bfloat16 if args.projector_mode == "autocast" else None
model.encoder_autocast_dtype = torch.bfloat16 if args.encoder_mode == "autocast" else None
# Compile hot ResBlocks for operator fusion
apply_torch_compile(model)
print(f">>> Prepared model loaded.")
else:
# ---- Normal path: construct + checkpoint + casting ----
config['model']['params']['wma_config']['params'][ config['model']['params']['wma_config']['params'][
'use_checkpoint'] = False 'use_checkpoint'] = False
model = instantiate_from_config(config.model) model = instantiate_from_config(config.model)
model.perframe_ae = args.perframe_ae model.perframe_ae = args.perframe_ae
assert os.path.exists(args.ckpt_path), "Error: checkpoint Not Found!" assert os.path.exists(args.ckpt_path), "Error: checkpoint Not Found!"
model = load_model_checkpoint(model, args.ckpt_path, model = load_model_checkpoint(model, args.ckpt_path)
device=f"cuda:{gpu_no}")
model.eval() model.eval()
print(f'>>> Load pre-trained model ...') print(f'>>> Load pre-trained model ...')
# Apply precision settings before moving to GPU # Apply precision settings before moving to GPU
model = apply_precision_settings(model, args) model = apply_precision_settings(model, args)
# Compile hot ResBlocks for operator fusion
apply_torch_compile(model)
# Export precision-converted checkpoint if requested # Export precision-converted checkpoint if requested
if args.export_precision_ckpt: if args.export_precision_ckpt:
export_path = args.export_precision_ckpt export_path = args.export_precision_ckpt
@@ -731,21 +633,13 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
print(f">>> Precision-converted checkpoint saved to: {export_path}") print(f">>> Precision-converted checkpoint saved to: {export_path}")
return return
model = model.cuda(gpu_no) # Build unnormalizer
# Save prepared model for fast loading next time (before torch.compile)
print(f">>> Saving prepared model to {prepared_path} ...")
torch.save(model, prepared_path)
print(f">>> Prepared model saved ({os.path.getsize(prepared_path) / 1024**3:.1f} GB).")
# Compile hot ResBlocks for operator fusion (after save, compiled objects can't be pickled)
apply_torch_compile(model)
# Build normalizer (always needed, independent of model loading path)
logging.info("***** Configing Data *****") logging.info("***** Configing Data *****")
data = instantiate_from_config(config.data) data = instantiate_from_config(config.data)
data.setup() data.setup()
print(">>> Dataset is successfully loaded ...") print(">>> Dataset is successfully loaded ...")
model = model.cuda(gpu_no)
device = get_device_from_parameters(model) device = get_device_from_parameters(model)
# Run over data # Run over data
@@ -923,13 +817,13 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
# Save the imagen videos for decision-making # Save the imagen videos for decision-making
if pred_videos_0 is not None: if pred_videos_0 is not None:
sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}" sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}"
log_to_tensorboard_async(writer, log_to_tensorboard(writer,
pred_videos_0, pred_videos_0,
sample_tag, sample_tag,
fps=args.save_fps) fps=args.save_fps)
# Save videos environment changes via world-model interaction # Save videos environment changes via world-model interaction
sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/itr-{itr}" sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/itr-{itr}"
log_to_tensorboard_async(writer, log_to_tensorboard(writer,
pred_videos_1, pred_videos_1,
sample_tag, sample_tag,
fps=args.save_fps) fps=args.save_fps)
@@ -937,12 +831,12 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
# Save the imagen videos for decision-making # Save the imagen videos for decision-making
if pred_videos_0 is not None: if pred_videos_0 is not None:
sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4' sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4'
save_results_async(pred_videos_0, save_results(pred_videos_0.cpu(),
sample_video_file, sample_video_file,
fps=args.save_fps) fps=args.save_fps)
# Save videos environment changes via world-model interaction # Save videos environment changes via world-model interaction
sample_video_file = f'{video_save_dir}/wm/{fs}/itr-{itr}.mp4' sample_video_file = f'{video_save_dir}/wm/{fs}/itr-{itr}.mp4'
save_results_async(pred_videos_1, save_results(pred_videos_1.cpu(),
sample_video_file, sample_video_file,
fps=args.save_fps) fps=args.save_fps)
@@ -952,15 +846,12 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
full_video = torch.cat(wm_video, dim=2) full_video = torch.cat(wm_video, dim=2)
sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/full" sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/full"
log_to_tensorboard_async(writer, log_to_tensorboard(writer,
full_video, full_video,
sample_tag, sample_tag,
fps=args.save_fps) fps=args.save_fps)
sample_full_video_file = f"{video_save_dir}/../{sample['videoid']}_full_fs{fs}.mp4" sample_full_video_file = f"{video_save_dir}/../{sample['videoid']}_full_fs{fs}.mp4"
save_results_async(full_video, sample_full_video_file, fps=args.save_fps) save_results(full_video, sample_full_video_file, fps=args.save_fps)
# Wait for all async I/O to complete
_flush_io()
def get_parser(): def get_parser():

View File

@@ -99,8 +99,6 @@ class AutoencoderKL(pl.LightningModule):
print(f"Restored from {path}") print(f"Restored from {path}")
def encode(self, x, **kwargs): def encode(self, x, **kwargs):
if getattr(self, '_channels_last', False):
x = x.to(memory_format=torch.channels_last)
h = self.encoder(x) h = self.encoder(x)
moments = self.quant_conv(h) moments = self.quant_conv(h)
@@ -108,8 +106,6 @@ class AutoencoderKL(pl.LightningModule):
return posterior return posterior
def decode(self, z, **kwargs): def decode(self, z, **kwargs):
if getattr(self, '_channels_last', False):
z = z.to(memory_format=torch.channels_last)
z = self.post_quant_conv(z) z = self.post_quant_conv(z)
dec = self.decoder(z) dec = self.decoder(z)
return dec return dec

View File

@@ -1074,10 +1074,10 @@ class LatentDiffusion(DDPM):
encoder_posterior = self.first_stage_model.encode(x) encoder_posterior = self.first_stage_model.encode(x)
results = self.get_first_stage_encoding(encoder_posterior).detach() results = self.get_first_stage_encoding(encoder_posterior).detach()
else: ## Consume less GPU memory but slower else: ## Consume less GPU memory but slower
bs = getattr(self, 'vae_encode_bs', 1)
results = [] results = []
for i in range(0, x.shape[0], bs): for index in range(x.shape[0]):
frame_batch = self.first_stage_model.encode(x[i:i + bs]) frame_batch = self.first_stage_model.encode(x[index:index +
1, :, :, :])
frame_result = self.get_first_stage_encoding( frame_result = self.get_first_stage_encoding(
frame_batch).detach() frame_batch).detach()
results.append(frame_result) results.append(frame_result)
@@ -1109,14 +1109,14 @@ class LatentDiffusion(DDPM):
vae_dtype = next(self.first_stage_model.parameters()).dtype vae_dtype = next(self.first_stage_model.parameters()).dtype
z = z.to(dtype=vae_dtype) z = z.to(dtype=vae_dtype)
z = 1. / self.scale_factor * z
if not self.perframe_ae: if not self.perframe_ae:
z = 1. / self.scale_factor * z
results = self.first_stage_model.decode(z, **kwargs) results = self.first_stage_model.decode(z, **kwargs)
else: else:
bs = getattr(self, 'vae_decode_bs', 1)
results = [] results = []
for i in range(0, z.shape[0], bs): for index in range(z.shape[0]):
frame_result = self.first_stage_model.decode(z[i:i + bs], **kwargs) frame_z = 1. / self.scale_factor * z[index:index + 1, :, :, :]
frame_result = self.first_stage_model.decode(frame_z, **kwargs)
results.append(frame_result) results.append(frame_result)
results = torch.cat(results, dim=0) results = torch.cat(results, dim=0)

View File

@@ -1,7 +0,0 @@
{
"permissions": {
"allow": [
"Bash(python3:*)"
]
}
}

View File

@@ -11,7 +11,7 @@ from unifolm_wma.utils.utils import instantiate_from_config
def nonlinearity(x): def nonlinearity(x):
# swish # swish
return torch.nn.functional.silu(x) return x * torch.sigmoid(x)
def Normalize(in_channels, num_groups=32): def Normalize(in_channels, num_groups=32):

View File

@@ -848,15 +848,16 @@ class WMAModel(nn.Module):
if not self.base_model_gen_only: if not self.base_model_gen_only:
ba, _, _ = x_action.shape ba, _, _ = x_action.shape
ts_state = timesteps[:ba] if b > 1 else timesteps
# Run action_unet and state_unet in parallel via CUDA streams
s_stream = torch.cuda.Stream()
s_stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s_stream):
s_y = self.state_unet(x_state, ts_state, hs_a,
context_action[:2], **kwargs)
a_y = self.action_unet(x_action, timesteps[:ba], hs_a, a_y = self.action_unet(x_action, timesteps[:ba], hs_a,
context_action[:2], **kwargs) context_action[:2], **kwargs)
# Predict state torch.cuda.current_stream().wait_stream(s_stream)
if b > 1:
s_y = self.state_unet(x_state, timesteps[:ba], hs_a,
context_action[:2], **kwargs)
else:
s_y = self.state_unet(x_state, timesteps, hs_a,
context_action[:2], **kwargs)
else: else:
a_y = torch.zeros_like(x_action) a_y = torch.zeros_like(x_action)
s_y = torch.zeros_like(x_state) s_y = torch.zeros_like(x_state)

View File

@@ -1,14 +1,14 @@
/mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/lightning_fabric/__init__.py:29: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. /mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/lightning_fabric/__init__.py:29: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
__import__("pkg_resources").declare_namespace(__name__) __import__("pkg_resources").declare_namespace(__name__)
2026-02-09 18:39:50.119842: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2026-02-10 10:36:44.797852: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-09 18:39:50.123128: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used. 2026-02-10 10:36:44.801300: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2026-02-09 18:39:50.156652: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered 2026-02-10 10:36:44.837891: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-02-09 18:39:50.156708: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered 2026-02-10 10:36:44.837946: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-02-09 18:39:50.158926: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered 2026-02-10 10:36:44.839880: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-02-09 18:39:50.167779: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used. 2026-02-10 10:36:44.849073: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2026-02-09 18:39:50.168073: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. 2026-02-10 10:36:44.849365: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-02-09 18:39:50.915144: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT 2026-02-10 10:36:45.644793: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
[rank: 0] Global seed set to 123 [rank: 0] Global seed set to 123
/mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/kornia/feature/lightglue.py:44: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead. /mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/kornia/feature/lightglue.py:44: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
@torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
@@ -116,7 +116,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin
DEBUG:PIL.Image:Importing WmfImagePlugin DEBUG:PIL.Image:Importing WmfImagePlugin
DEBUG:PIL.Image:Importing XbmImagePlugin DEBUG:PIL.Image:Importing XbmImagePlugin
DEBUG:PIL.Image:Importing XpmImagePlugin DEBUG:PIL.Image:Importing XpmImagePlugin
DEBUG:PIL.Image:Importing XVThumbImagePlugin DEBUG:PIL.Image:Importing XVThumbImagePlugin
12%|█▎ | 1/8 [01:06<07:46, 66.62s/it] 12%|█▎ | 1/8 [01:06<07:46, 66.62s/it]
25%|██▌ | 2/8 [02:07<06:17, 62.97s/it] 25%|██▌ | 2/8 [02:07<06:17, 62.97s/it]
@@ -140,6 +140,6 @@ DEBUG:PIL.Image:Importing XVThumbImagePlugin
>>> Step 4: generating actions ... >>> Step 4: generating actions ...
>>> Step 4: interacting with world model ... >>> Step 4: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 5: generating actions ... >>> Step 5: generating actions ...
>>> Step 5: interacting with world model ... >>> Step 5: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>>>>>>>>>>>

View File

@@ -0,0 +1,5 @@
itr,stack_to_device_1,policy/ddim_sampler_init,policy/image_embedding,policy/vae_encode,policy/text_conditioning,policy/projectors,policy/cond_assembly,policy/ddim_sampling,policy/vae_decode,synth_policy,update_action_queue,stack_to_device_2,wm/ddim_sampler_init,wm/image_embedding,wm/vae_encode,wm/text_conditioning,wm/projectors,wm/cond_assembly,wm/ddim_sampling,wm/vae_decode,synth_world_model,update_obs_queue,tensorboard_log,save_results,cpu_transfer,itr_total
0,0.16,0.08,20.98,49.56,14.51,0.29,0.07,31005.48,0.00,31094.51,0.39,0.13,0.09,20.62,48.76,14.17,0.28,0.07,31011.17,775.40,31875.87,0.61,0.31,97.28,7.19,63077.50
1,0.16,0.09,20.97,49.63,14.52,0.30,0.07,31035.49,0.00,31125.16,0.54,0.17,0.14,21.46,49.26,14.88,0.49,0.12,31047.54,777.56,31918.60,0.75,0.60,109.89,6.21,63163.18
2,0.18,0.10,21.44,49.71,15.05,0.34,0.07,31047.64,0.00,31138.56,0.58,0.16,0.13,21.03,48.74,14.69,0.32,0.08,31036.47,776.96,31905.96,0.67,0.39,116.96,7.43,63171.90
3,0.18,0.10,21.38,49.47,15.02,0.35,0.08,31041.05,0.00,31132.03,0.48,0.16,0.12,20.81,49.34,14.41,0.47,0.11,31051.98,777.11,31920.42,0.64,0.38,121.67,7.29,63184.26
1 itr stack_to_device_1 policy/ddim_sampler_init policy/image_embedding policy/vae_encode policy/text_conditioning policy/projectors policy/cond_assembly policy/ddim_sampling policy/vae_decode synth_policy update_action_queue stack_to_device_2 wm/ddim_sampler_init wm/image_embedding wm/vae_encode wm/text_conditioning wm/projectors wm/cond_assembly wm/ddim_sampling wm/vae_decode synth_world_model update_obs_queue tensorboard_log save_results cpu_transfer itr_total
2 0 0.16 0.08 20.98 49.56 14.51 0.29 0.07 31005.48 0.00 31094.51 0.39 0.13 0.09 20.62 48.76 14.17 0.28 0.07 31011.17 775.40 31875.87 0.61 0.31 97.28 7.19 63077.50
3 1 0.16 0.09 20.97 49.63 14.52 0.30 0.07 31035.49 0.00 31125.16 0.54 0.17 0.14 21.46 49.26 14.88 0.49 0.12 31047.54 777.56 31918.60 0.75 0.60 109.89 6.21 63163.18
4 2 0.18 0.10 21.44 49.71 15.05 0.34 0.07 31047.64 0.00 31138.56 0.58 0.16 0.13 21.03 48.74 14.69 0.32 0.08 31036.47 776.96 31905.96 0.67 0.39 116.96 7.43 63171.90
5 3 0.18 0.10 21.38 49.47 15.02 0.35 0.08 31041.05 0.00 31132.03 0.48 0.16 0.12 20.81 49.34 14.41 0.47 0.11 31051.98 777.11 31920.42 0.64 0.38 121.67 7.29 63184.26

View File

@@ -0,0 +1,5 @@
stat,stack_to_device_1,policy/ddim_sampler_init,policy/image_embedding,policy/vae_encode,policy/text_conditioning,policy/projectors,policy/cond_assembly,policy/ddim_sampling,policy/vae_decode,synth_policy,update_action_queue,stack_to_device_2,wm/ddim_sampler_init,wm/image_embedding,wm/vae_encode,wm/text_conditioning,wm/projectors,wm/cond_assembly,wm/ddim_sampling,wm/vae_decode,synth_world_model,update_obs_queue,tensorboard_log,save_results,cpu_transfer,itr_total
mean,0.17,0.09,21.19,49.59,14.78,0.32,0.07,31032.42,0.00,31122.56,0.49,0.15,0.12,20.98,49.03,14.53,0.39,0.10,31036.79,776.76,31905.21,0.67,0.42,111.45,7.03,63149.21
std,0.01,0.01,0.22,0.09,0.26,0.03,0.00,16.13,0.00,16.88,0.07,0.01,0.02,0.31,0.28,0.27,0.09,0.02,15.83,0.82,17.84,0.05,0.11,9.19,0.48,42.08
min,0.16,0.08,20.97,49.47,14.51,0.29,0.07,31005.48,0.00,31094.51,0.39,0.13,0.09,20.62,48.74,14.17,0.28,0.07,31011.17,775.40,31875.87,0.61,0.31,97.28,6.21,63077.50
max,0.18,0.10,21.44,49.71,15.05,0.35,0.08,31047.64,0.00,31138.56,0.58,0.17,0.14,21.46,49.34,14.88,0.49,0.12,31051.98,777.56,31920.42,0.75,0.60,121.67,7.43,63184.26
1 stat stack_to_device_1 policy/ddim_sampler_init policy/image_embedding policy/vae_encode policy/text_conditioning policy/projectors policy/cond_assembly policy/ddim_sampling policy/vae_decode synth_policy update_action_queue stack_to_device_2 wm/ddim_sampler_init wm/image_embedding wm/vae_encode wm/text_conditioning wm/projectors wm/cond_assembly wm/ddim_sampling wm/vae_decode synth_world_model update_obs_queue tensorboard_log save_results cpu_transfer itr_total
2 mean 0.17 0.09 21.19 49.59 14.78 0.32 0.07 31032.42 0.00 31122.56 0.49 0.15 0.12 20.98 49.03 14.53 0.39 0.10 31036.79 776.76 31905.21 0.67 0.42 111.45 7.03 63149.21
3 std 0.01 0.01 0.22 0.09 0.26 0.03 0.00 16.13 0.00 16.88 0.07 0.01 0.02 0.31 0.28 0.27 0.09 0.02 15.83 0.82 17.84 0.05 0.11 9.19 0.48 42.08
4 min 0.16 0.08 20.97 49.47 14.51 0.29 0.07 31005.48 0.00 31094.51 0.39 0.13 0.09 20.62 48.74 14.17 0.28 0.07 31011.17 775.40 31875.87 0.61 0.31 97.28 6.21 63077.50
5 max 0.18 0.10 21.44 49.71 15.05 0.35 0.08 31047.64 0.00 31138.56 0.58 0.17 0.14 21.46 49.34 14.88 0.49 0.12 31051.98 777.56 31920.42 0.75 0.60 121.67 7.43 63184.26

View File

@@ -0,0 +1,45 @@
/mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/lightning_fabric/__init__.py:29: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
__import__("pkg_resources").declare_namespace(__name__)
[rank: 0] Global seed set to 123
/mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/kornia/feature/lightglue.py:44: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
@torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
/mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/open_clip/factory.py:88: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
checkpoint = torch.load(checkpoint_path, map_location=map_location)
/mnt/ASC1637/unifolm-world-model-action/scripts/evaluation/profile_iteration.py:168: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
state_dict = torch.load(args.ckpt_path, map_location="cpu")
============================================================
PROFILE ITERATION — Loading model...
============================================================
AE working on z of shape (1, 4, 32, 32) = 4096 dimensions.
torch.compile: 3 ResBlocks in output_blocks[5, 8, 9]
>>> Model loaded and ready.
>>> Noise shape: [1, 4, 16, 40, 64]
>>> DDIM steps: 50
>>> fast_policy_no_decode: True
============================================================
LAYER 1: ITERATION-LEVEL PROFILING
============================================================
>>> unitree_z1_stackbox: 1 data samples loaded.
>>> unitree_z1_stackbox: data stats loaded.
>>> unitree_z1_stackbox: normalizer initiated.
>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
>>> unitree_z1_dual_arm_stackbox: data stats loaded.
>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
>>> unitree_g1_pack_camera: 1 data samples loaded.
>>> unitree_g1_pack_camera: data stats loaded.
>>> unitree_g1_pack_camera: normalizer initiated.
>>> Running 5 profiled iterations ...
Traceback (most recent call last):
File "/mnt/ASC1637/unifolm-world-model-action/scripts/evaluation/profile_iteration.py", line 981, in <module>
main()
File "/mnt/ASC1637/unifolm-world-model-action/scripts/evaluation/profile_iteration.py", line 967, in main
all_records = run_profiled_iterations(
File "/mnt/ASC1637/unifolm-world-model-action/scripts/evaluation/profile_iteration.py", line 502, in run_profiled_iterations
sampler_type=args.sampler_type)
AttributeError: 'Namespace' object has no attribute 'sampler_type'

View File

@@ -4,7 +4,7 @@ dataset="unitree_z1_dual_arm_cleanup_pencils"
{ {
time TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \ time TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
--seed 123 \ --seed 123 \
--ckpt_path ckpts/unifolm_wma_dual_mixbf16.ckpt \ --ckpt_path ckpts/unifolm_wma_dual_mix_bf16.ckpt \
--config configs/inference/world_model_interaction.yaml \ --config configs/inference/world_model_interaction.yaml \
--savedir "${res_dir}/output" \ --savedir "${res_dir}/output" \
--bs 1 --height 320 --width 512 \ --bs 1 --height 320 --width 512 \
@@ -21,9 +21,6 @@ dataset="unitree_z1_dual_arm_cleanup_pencils"
--timestep_spacing 'uniform_trailing' \ --timestep_spacing 'uniform_trailing' \
--guidance_rescale 0.7 \ --guidance_rescale 0.7 \
--perframe_ae \ --perframe_ae \
--diffusion_dtype fp32 \ --vae_dtype bf16 \
--projector_mode fp32 \
--encoder_mode fp32 \
--vae_dtype fp32 \
--fast_policy_no_decode --fast_policy_no_decode
} 2>&1 | tee "${res_dir}/output.log" } 2>&1 | tee "${res_dir}/output.log"