添加CrossAttention kv缓存,减少重复计算,提升性能,psnr=25.1201dB
This commit is contained in:
10
.claude/settings.local.json
Normal file
10
.claude/settings.local.json
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
{
|
||||||
|
"permissions": {
|
||||||
|
"allow": [
|
||||||
|
"Bash(conda env list:*)",
|
||||||
|
"Bash(mamba env:*)",
|
||||||
|
"Bash(micromamba env list:*)",
|
||||||
|
"Bash(echo:*)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -120,7 +120,7 @@ localTest/
|
|||||||
fig/
|
fig/
|
||||||
figure/
|
figure/
|
||||||
*.mp4
|
*.mp4
|
||||||
*.json
|
|
||||||
Data/ControlVAE.yml
|
Data/ControlVAE.yml
|
||||||
Data/Misc
|
Data/Misc
|
||||||
Data/Pretrained
|
Data/Pretrained
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from unifolm_wma.utils.diffusion import make_ddim_sampling_parameters, make_ddim
|
|||||||
from unifolm_wma.utils.common import noise_like
|
from unifolm_wma.utils.common import noise_like
|
||||||
from unifolm_wma.utils.common import extract_into_tensor
|
from unifolm_wma.utils.common import extract_into_tensor
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from unifolm_wma.modules.attention import enable_cross_attn_kv_cache, disable_cross_attn_kv_cache
|
||||||
|
|
||||||
|
|
||||||
class DDIMSampler(object):
|
class DDIMSampler(object):
|
||||||
@@ -243,63 +244,67 @@ class DDIMSampler(object):
|
|||||||
dp_ddim_scheduler_action.set_timesteps(len(timesteps))
|
dp_ddim_scheduler_action.set_timesteps(len(timesteps))
|
||||||
dp_ddim_scheduler_state.set_timesteps(len(timesteps))
|
dp_ddim_scheduler_state.set_timesteps(len(timesteps))
|
||||||
ts = torch.empty((b, ), device=device, dtype=torch.long)
|
ts = torch.empty((b, ), device=device, dtype=torch.long)
|
||||||
for i, step in enumerate(iterator):
|
enable_cross_attn_kv_cache(self.model)
|
||||||
index = total_steps - i - 1
|
try:
|
||||||
ts.fill_(step)
|
for i, step in enumerate(iterator):
|
||||||
|
index = total_steps - i - 1
|
||||||
|
ts.fill_(step)
|
||||||
|
|
||||||
# Use mask to blend noised original latent (img_orig) & new sampled latent (img)
|
# Use mask to blend noised original latent (img_orig) & new sampled latent (img)
|
||||||
if mask is not None:
|
if mask is not None:
|
||||||
assert x0 is not None
|
assert x0 is not None
|
||||||
if clean_cond:
|
if clean_cond:
|
||||||
img_orig = x0
|
img_orig = x0
|
||||||
else:
|
else:
|
||||||
img_orig = self.model.q_sample(x0, ts)
|
img_orig = self.model.q_sample(x0, ts)
|
||||||
img = img_orig * mask + (1. - mask) * img
|
img = img_orig * mask + (1. - mask) * img
|
||||||
|
|
||||||
outs = self.p_sample_ddim(
|
outs = self.p_sample_ddim(
|
||||||
img,
|
img,
|
||||||
action,
|
action,
|
||||||
state,
|
state,
|
||||||
cond,
|
cond,
|
||||||
ts,
|
ts,
|
||||||
index=index,
|
index=index,
|
||||||
use_original_steps=ddim_use_original_steps,
|
use_original_steps=ddim_use_original_steps,
|
||||||
quantize_denoised=quantize_denoised,
|
quantize_denoised=quantize_denoised,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
noise_dropout=noise_dropout,
|
noise_dropout=noise_dropout,
|
||||||
score_corrector=score_corrector,
|
score_corrector=score_corrector,
|
||||||
corrector_kwargs=corrector_kwargs,
|
corrector_kwargs=corrector_kwargs,
|
||||||
unconditional_guidance_scale=unconditional_guidance_scale,
|
unconditional_guidance_scale=unconditional_guidance_scale,
|
||||||
unconditional_conditioning=unconditional_conditioning,
|
unconditional_conditioning=unconditional_conditioning,
|
||||||
mask=mask,
|
mask=mask,
|
||||||
x0=x0,
|
x0=x0,
|
||||||
fs=fs,
|
fs=fs,
|
||||||
guidance_rescale=guidance_rescale,
|
guidance_rescale=guidance_rescale,
|
||||||
**kwargs)
|
**kwargs)
|
||||||
|
|
||||||
img, pred_x0, model_output_action, model_output_state = outs
|
img, pred_x0, model_output_action, model_output_state = outs
|
||||||
|
|
||||||
action = dp_ddim_scheduler_action.step(
|
action = dp_ddim_scheduler_action.step(
|
||||||
model_output_action,
|
model_output_action,
|
||||||
step,
|
step,
|
||||||
action,
|
action,
|
||||||
generator=None,
|
generator=None,
|
||||||
).prev_sample
|
).prev_sample
|
||||||
state = dp_ddim_scheduler_state.step(
|
state = dp_ddim_scheduler_state.step(
|
||||||
model_output_state,
|
model_output_state,
|
||||||
step,
|
step,
|
||||||
state,
|
state,
|
||||||
generator=None,
|
generator=None,
|
||||||
).prev_sample
|
).prev_sample
|
||||||
|
|
||||||
if callback: callback(i)
|
if callback: callback(i)
|
||||||
if img_callback: img_callback(pred_x0, i)
|
if img_callback: img_callback(pred_x0, i)
|
||||||
|
|
||||||
if index % log_every_t == 0 or index == total_steps - 1:
|
if index % log_every_t == 0 or index == total_steps - 1:
|
||||||
intermediates['x_inter'].append(img)
|
intermediates['x_inter'].append(img)
|
||||||
intermediates['pred_x0'].append(pred_x0)
|
intermediates['pred_x0'].append(pred_x0)
|
||||||
intermediates['x_inter_action'].append(action)
|
intermediates['x_inter_action'].append(action)
|
||||||
intermediates['x_inter_state'].append(state)
|
intermediates['x_inter_state'].append(state)
|
||||||
|
finally:
|
||||||
|
disable_cross_attn_kv_cache(self.model)
|
||||||
|
|
||||||
return img, action, state, intermediates
|
return img, action, state, intermediates
|
||||||
|
|
||||||
|
|||||||
@@ -98,6 +98,9 @@ class CrossAttention(nn.Module):
|
|||||||
self.text_context_len = text_context_len
|
self.text_context_len = text_context_len
|
||||||
self.agent_state_context_len = agent_state_context_len
|
self.agent_state_context_len = agent_state_context_len
|
||||||
self.agent_action_context_len = agent_action_context_len
|
self.agent_action_context_len = agent_action_context_len
|
||||||
|
self._kv_cache = {}
|
||||||
|
self._kv_cache_enabled = False
|
||||||
|
|
||||||
self.cross_attention_scale_learnable = cross_attention_scale_learnable
|
self.cross_attention_scale_learnable = cross_attention_scale_learnable
|
||||||
if self.image_cross_attention:
|
if self.image_cross_attention:
|
||||||
self.to_k_ip = nn.Linear(context_dim, inner_dim, bias=False)
|
self.to_k_ip = nn.Linear(context_dim, inner_dim, bias=False)
|
||||||
@@ -236,17 +239,42 @@ class CrossAttention(nn.Module):
|
|||||||
k_ip, v_ip, out_ip = None, None, None
|
k_ip, v_ip, out_ip = None, None, None
|
||||||
k_as, v_as, out_as = None, None, None
|
k_as, v_as, out_as = None, None, None
|
||||||
k_aa, v_aa, out_aa = None, None, None
|
k_aa, v_aa, out_aa = None, None, None
|
||||||
|
attn_mask_aa = None
|
||||||
|
|
||||||
|
h = self.heads
|
||||||
q = self.to_q(x)
|
q = self.to_q(x)
|
||||||
context = default(context, x)
|
context = default(context, x)
|
||||||
|
|
||||||
if self.image_cross_attention and not spatial_self_attn:
|
b, _, _ = q.shape
|
||||||
|
q = q.unsqueeze(3).reshape(b, q.shape[1], h, self.dim_head).permute(0, 2, 1, 3).reshape(b * h, q.shape[1], self.dim_head).contiguous()
|
||||||
|
|
||||||
|
def _reshape_kv(t):
|
||||||
|
return t.unsqueeze(3).reshape(b, t.shape[1], h, self.dim_head).permute(0, 2, 1, 3).reshape(b * h, t.shape[1], self.dim_head).contiguous()
|
||||||
|
|
||||||
|
use_cache = self._kv_cache_enabled and not spatial_self_attn
|
||||||
|
cache_hit = use_cache and len(self._kv_cache) > 0
|
||||||
|
|
||||||
|
if cache_hit:
|
||||||
|
k = self._kv_cache['k']
|
||||||
|
v = self._kv_cache['v']
|
||||||
|
k_ip = self._kv_cache.get('k_ip')
|
||||||
|
v_ip = self._kv_cache.get('v_ip')
|
||||||
|
k_as = self._kv_cache.get('k_as')
|
||||||
|
v_as = self._kv_cache.get('v_as')
|
||||||
|
k_aa = self._kv_cache.get('k_aa')
|
||||||
|
v_aa = self._kv_cache.get('v_aa')
|
||||||
|
attn_mask_aa = self._kv_cache.get('attn_mask_aa')
|
||||||
|
elif self.image_cross_attention and not spatial_self_attn:
|
||||||
if context.shape[1] == self.text_context_len + self.video_length:
|
if context.shape[1] == self.text_context_len + self.video_length:
|
||||||
context_ins, context_image = context[:, :self.text_context_len, :], context[:,self.text_context_len:, :]
|
context_ins, context_image = context[:, :self.text_context_len, :], context[:,self.text_context_len:, :]
|
||||||
k = self.to_k(context)
|
k = self.to_k(context)
|
||||||
v = self.to_v(context)
|
v = self.to_v(context)
|
||||||
k_ip = self.to_k_ip(context_image)
|
k_ip = self.to_k_ip(context_image)
|
||||||
v_ip = self.to_v_ip(context_image)
|
v_ip = self.to_v_ip(context_image)
|
||||||
|
k, v = map(_reshape_kv, (k, v))
|
||||||
|
k_ip, v_ip = map(_reshape_kv, (k_ip, v_ip))
|
||||||
|
if use_cache:
|
||||||
|
self._kv_cache = {'k': k, 'v': v, 'k_ip': k_ip, 'v_ip': v_ip}
|
||||||
elif context.shape[1] == self.agent_state_context_len + self.text_context_len + self.video_length:
|
elif context.shape[1] == self.agent_state_context_len + self.text_context_len + self.video_length:
|
||||||
context_agent_state = context[:, :self.agent_state_context_len, :]
|
context_agent_state = context[:, :self.agent_state_context_len, :]
|
||||||
context_ins = context[:, self.agent_state_context_len:self.agent_state_context_len+self.text_context_len, :]
|
context_ins = context[:, self.agent_state_context_len:self.agent_state_context_len+self.text_context_len, :]
|
||||||
@@ -257,6 +285,11 @@ class CrossAttention(nn.Module):
|
|||||||
v_ip = self.to_v_ip(context_image)
|
v_ip = self.to_v_ip(context_image)
|
||||||
k_as = self.to_k_as(context_agent_state)
|
k_as = self.to_k_as(context_agent_state)
|
||||||
v_as = self.to_v_as(context_agent_state)
|
v_as = self.to_v_as(context_agent_state)
|
||||||
|
k, v = map(_reshape_kv, (k, v))
|
||||||
|
k_ip, v_ip = map(_reshape_kv, (k_ip, v_ip))
|
||||||
|
k_as, v_as = map(_reshape_kv, (k_as, v_as))
|
||||||
|
if use_cache:
|
||||||
|
self._kv_cache = {'k': k, 'v': v, 'k_ip': k_ip, 'v_ip': v_ip, 'k_as': k_as, 'v_as': v_as}
|
||||||
else:
|
else:
|
||||||
context_agent_state = context[:, :self.agent_state_context_len, :]
|
context_agent_state = context[:, :self.agent_state_context_len, :]
|
||||||
context_agent_action = context[:, self.agent_state_context_len:self.agent_state_context_len+self.agent_action_context_len, :]
|
context_agent_action = context[:, self.agent_state_context_len:self.agent_state_context_len+self.agent_action_context_len, :]
|
||||||
@@ -272,99 +305,78 @@ class CrossAttention(nn.Module):
|
|||||||
k_aa = self.to_k_aa(context_agent_action)
|
k_aa = self.to_k_aa(context_agent_action)
|
||||||
v_aa = self.to_v_aa(context_agent_action)
|
v_aa = self.to_v_aa(context_agent_action)
|
||||||
|
|
||||||
attn_mask_aa = self._get_attn_mask_aa(x.shape[0],
|
k, v = map(_reshape_kv, (k, v))
|
||||||
q.shape[1],
|
k_ip, v_ip = map(_reshape_kv, (k_ip, v_ip))
|
||||||
k_aa.shape[1],
|
k_as, v_as = map(_reshape_kv, (k_as, v_as))
|
||||||
block_size=16,
|
k_aa, v_aa = map(_reshape_kv, (k_aa, v_aa))
|
||||||
device=k_aa.device)
|
|
||||||
|
attn_mask_aa_raw = self._get_attn_mask_aa(x.shape[0],
|
||||||
|
q.shape[1],
|
||||||
|
k_aa.shape[1],
|
||||||
|
block_size=16,
|
||||||
|
device=k_aa.device)
|
||||||
|
attn_mask_aa = attn_mask_aa_raw.unsqueeze(1).repeat(1, h, 1, 1).reshape(
|
||||||
|
b * h, attn_mask_aa_raw.shape[1], attn_mask_aa_raw.shape[2]).to(q.dtype)
|
||||||
|
|
||||||
|
if use_cache:
|
||||||
|
self._kv_cache = {
|
||||||
|
'k': k, 'v': v, 'k_ip': k_ip, 'v_ip': v_ip,
|
||||||
|
'k_as': k_as, 'v_as': v_as, 'k_aa': k_aa, 'v_aa': v_aa,
|
||||||
|
'attn_mask_aa': attn_mask_aa,
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
if not spatial_self_attn:
|
if not spatial_self_attn:
|
||||||
assert 1 > 2, ">>> ERROR: you should never go into here ..."
|
assert 1 > 2, ">>> ERROR: you should never go into here ..."
|
||||||
context = context[:, :self.text_context_len, :]
|
context = context[:, :self.text_context_len, :]
|
||||||
k = self.to_k(context)
|
k = self.to_k(context)
|
||||||
v = self.to_v(context)
|
v = self.to_v(context)
|
||||||
|
k, v = map(_reshape_kv, (k, v))
|
||||||
b, _, _ = q.shape
|
if use_cache:
|
||||||
q = q.unsqueeze(3).reshape(b, q.shape[1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(b * self.heads, q.shape[1], self.dim_head).contiguous()
|
self._kv_cache = {'k': k, 'v': v}
|
||||||
if k is not None:
|
if k is not None:
|
||||||
k, v = map(
|
|
||||||
lambda t: t.unsqueeze(3).reshape(b, t.shape[
|
|
||||||
1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(
|
|
||||||
b * self.heads, t.shape[1], self.dim_head).contiguous(),
|
|
||||||
(k, v),
|
|
||||||
)
|
|
||||||
out = xformers.ops.memory_efficient_attention(q,
|
out = xformers.ops.memory_efficient_attention(q,
|
||||||
k,
|
k,
|
||||||
v,
|
v,
|
||||||
attn_bias=None,
|
attn_bias=None,
|
||||||
op=None)
|
op=None)
|
||||||
out = (out.unsqueeze(0).reshape(
|
out = (out.unsqueeze(0).reshape(
|
||||||
b, self.heads, out.shape[1],
|
b, h, out.shape[1],
|
||||||
self.dim_head).permute(0, 2, 1,
|
self.dim_head).permute(0, 2, 1,
|
||||||
3).reshape(b, out.shape[1],
|
3).reshape(b, out.shape[1],
|
||||||
self.heads * self.dim_head))
|
h * self.dim_head))
|
||||||
|
|
||||||
if k_ip is not None:
|
if k_ip is not None:
|
||||||
# For image cross-attention
|
|
||||||
k_ip, v_ip = map(
|
|
||||||
lambda t: t.unsqueeze(3).reshape(b, t.shape[
|
|
||||||
1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(
|
|
||||||
b * self.heads, t.shape[1], self.dim_head).contiguous(
|
|
||||||
),
|
|
||||||
(k_ip, v_ip),
|
|
||||||
)
|
|
||||||
out_ip = xformers.ops.memory_efficient_attention(q,
|
out_ip = xformers.ops.memory_efficient_attention(q,
|
||||||
k_ip,
|
k_ip,
|
||||||
v_ip,
|
v_ip,
|
||||||
attn_bias=None,
|
attn_bias=None,
|
||||||
op=None)
|
op=None)
|
||||||
out_ip = (out_ip.unsqueeze(0).reshape(
|
out_ip = (out_ip.unsqueeze(0).reshape(
|
||||||
b, self.heads, out_ip.shape[1],
|
b, h, out_ip.shape[1],
|
||||||
self.dim_head).permute(0, 2, 1,
|
self.dim_head).permute(0, 2, 1,
|
||||||
3).reshape(b, out_ip.shape[1],
|
3).reshape(b, out_ip.shape[1],
|
||||||
self.heads * self.dim_head))
|
h * self.dim_head))
|
||||||
|
|
||||||
if k_as is not None:
|
if k_as is not None:
|
||||||
# For agent state cross-attention
|
|
||||||
k_as, v_as = map(
|
|
||||||
lambda t: t.unsqueeze(3).reshape(b, t.shape[
|
|
||||||
1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(
|
|
||||||
b * self.heads, t.shape[1], self.dim_head).contiguous(
|
|
||||||
),
|
|
||||||
(k_as, v_as),
|
|
||||||
)
|
|
||||||
out_as = xformers.ops.memory_efficient_attention(q,
|
out_as = xformers.ops.memory_efficient_attention(q,
|
||||||
k_as,
|
k_as,
|
||||||
v_as,
|
v_as,
|
||||||
attn_bias=None,
|
attn_bias=None,
|
||||||
op=None)
|
op=None)
|
||||||
out_as = (out_as.unsqueeze(0).reshape(
|
out_as = (out_as.unsqueeze(0).reshape(
|
||||||
b, self.heads, out_as.shape[1],
|
b, h, out_as.shape[1],
|
||||||
self.dim_head).permute(0, 2, 1,
|
self.dim_head).permute(0, 2, 1,
|
||||||
3).reshape(b, out_as.shape[1],
|
3).reshape(b, out_as.shape[1],
|
||||||
self.heads * self.dim_head))
|
h * self.dim_head))
|
||||||
|
|
||||||
if k_aa is not None:
|
if k_aa is not None:
|
||||||
# For agent action cross-attention
|
|
||||||
k_aa, v_aa = map(
|
|
||||||
lambda t: t.unsqueeze(3).reshape(b, t.shape[
|
|
||||||
1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(
|
|
||||||
b * self.heads, t.shape[1], self.dim_head).contiguous(
|
|
||||||
),
|
|
||||||
(k_aa, v_aa),
|
|
||||||
)
|
|
||||||
|
|
||||||
attn_mask_aa = attn_mask_aa.unsqueeze(1).repeat(1,self.heads,1,1).reshape(
|
|
||||||
b * self.heads, attn_mask_aa.shape[1], attn_mask_aa.shape[2])
|
|
||||||
attn_mask_aa = attn_mask_aa.to(q.dtype)
|
|
||||||
|
|
||||||
out_aa = xformers.ops.memory_efficient_attention(
|
out_aa = xformers.ops.memory_efficient_attention(
|
||||||
q, k_aa, v_aa, attn_bias=attn_mask_aa, op=None)
|
q, k_aa, v_aa, attn_bias=attn_mask_aa, op=None)
|
||||||
|
|
||||||
out_aa = (out_aa.unsqueeze(0).reshape(
|
out_aa = (out_aa.unsqueeze(0).reshape(
|
||||||
b, self.heads, out_aa.shape[1],
|
b, h, out_aa.shape[1],
|
||||||
self.dim_head).permute(0, 2, 1,
|
self.dim_head).permute(0, 2, 1,
|
||||||
3).reshape(b, out_aa.shape[1],
|
3).reshape(b, out_aa.shape[1],
|
||||||
self.heads * self.dim_head))
|
h * self.dim_head))
|
||||||
if exists(mask):
|
if exists(mask):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@@ -410,6 +422,20 @@ class CrossAttention(nn.Module):
|
|||||||
return attn_mask
|
return attn_mask
|
||||||
|
|
||||||
|
|
||||||
|
def enable_cross_attn_kv_cache(module):
    """Turn on K/V caching for every CrossAttention layer under ``module``.

    Iterates ``module.modules()`` and, for each ``CrossAttention`` instance,
    sets ``_kv_cache_enabled`` to True and rebinds ``_kv_cache`` to a new
    empty dict, so entries left over from an earlier run are not reused.

    Args:
        module: Object exposing ``modules()``; its sub-modules are scanned.
    """
    attn_layers = (sub for sub in module.modules()
                   if isinstance(sub, CrossAttention))
    for layer in attn_layers:
        layer._kv_cache_enabled = True
        # Start from an empty cache; the first forward pass repopulates it.
        layer._kv_cache = {}
|
||||||
|
|
||||||
|
|
||||||
|
def disable_cross_attn_kv_cache(module):
    """Turn off K/V caching for every CrossAttention layer under ``module``.

    Iterates ``module.modules()`` and, for each ``CrossAttention`` instance,
    sets ``_kv_cache_enabled`` to False and rebinds ``_kv_cache`` to a new
    empty dict, dropping whatever the layer had stored.

    Args:
        module: Object exposing ``modules()``; its sub-modules are scanned.
    """
    for layer in module.modules():
        if not isinstance(layer, CrossAttention):
            continue
        layer._kv_cache_enabled = False
        # Release the references held by the previous cache dict.
        layer._kv_cache = {}
|
||||||
|
|
||||||
|
|
||||||
class BasicTransformerBlock(nn.Module):
|
class BasicTransformerBlock(nn.Module):
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
|
|||||||
@@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"gt_video": "unitree_z1_dual_arm_cleanup_pencils/case1/unitree_z1_dual_arm_cleanup_pencils_case1.mp4",
|
||||||
|
"pred_video": "unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/0_full_fs4.mp4",
|
||||||
|
"psnr": 47.911564449209735
|
||||||
|
}
|
||||||
@@ -1,10 +1,10 @@
|
|||||||
2026-02-10 17:03:42.057881: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
|
2026-02-10 17:25:35.484333: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
|
||||||
2026-02-10 17:03:42.107520: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
|
2026-02-10 17:25:35.533963: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
|
||||||
2026-02-10 17:03:42.107564: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
|
2026-02-10 17:25:35.534009: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
|
||||||
2026-02-10 17:03:42.108900: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
|
2026-02-10 17:25:35.535311: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
|
||||||
2026-02-10 17:03:42.116404: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
|
2026-02-10 17:25:35.542814: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
|
||||||
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
|
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
|
||||||
2026-02-10 17:03:43.044539: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
|
2026-02-10 17:25:36.471650: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
|
||||||
Global seed set to 123
|
Global seed set to 123
|
||||||
INFO:mainlogger:LatentVisualDiffusion: Running in v-prediction mode
|
INFO:mainlogger:LatentVisualDiffusion: Running in v-prediction mode
|
||||||
INFO:unifolm_wma.models.diffusion_head.conditional_unet1d:number of parameters: 5.010531e+08
|
INFO:unifolm_wma.models.diffusion_head.conditional_unet1d:number of parameters: 5.010531e+08
|
||||||
@@ -92,7 +92,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin
|
|||||||
DEBUG:PIL.Image:Importing WmfImagePlugin
|
DEBUG:PIL.Image:Importing WmfImagePlugin
|
||||||
DEBUG:PIL.Image:Importing XbmImagePlugin
|
DEBUG:PIL.Image:Importing XbmImagePlugin
|
||||||
DEBUG:PIL.Image:Importing XpmImagePlugin
|
DEBUG:PIL.Image:Importing XpmImagePlugin
|
||||||
DEBUG:PIL.Image:Importing XVThumbImagePlugin
|
DEBUG:PIL.Image:Importing XVThumbImagePlugin
|
||||||
|
|
||||||
9%|▉ | 1/11 [00:36<06:07, 36.77s/it]
|
9%|▉ | 1/11 [00:36<06:07, 36.77s/it]
|
||||||
18%|█▊ | 2/11 [01:13<05:32, 36.92s/it]
|
18%|█▊ | 2/11 [01:13<05:32, 36.92s/it]
|
||||||
@@ -125,6 +125,6 @@ DEBUG:PIL.Image:Importing XVThumbImagePlugin
|
|||||||
>>> Step 6: generating actions ...
|
>>> Step 6: generating actions ...
|
||||||
>>> Step 6: interacting with world model ...
|
>>> Step 6: interacting with world model ...
|
||||||
>>>>>>>>>>>>>>>>>>>>>>>>
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
>>> Step 7: generating actions ...
|
>>> Step 7: generating actions ...
|
||||||
>>> Step 7: interacting with world model ...
|
>>> Step 7: interacting with world model ...
|
||||||
>>>>>>>>>>>>>>>>>>>>>>>>
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
|||||||
5
unitree_z1_dual_arm_stackbox_v2/case1/psnr_result.json
Normal file
5
unitree_z1_dual_arm_stackbox_v2/case1/psnr_result.json
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"gt_video": "/home/qhy/unifolm-world-model-action/unitree_z1_dual_arm_stackbox_v2/case1/unitree_z1_dual_arm_stackbox_v2_case1.mp4",
|
||||||
|
"pred_video": "/home/qhy/unifolm-world-model-action/unitree_z1_dual_arm_stackbox_v2/case1/output/inference/5_full_fs4.mp4",
|
||||||
|
"psnr": 25.12008483689618
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user