Compare commits

14 Commits (main ... trt-replac)

| SHA1 |
|---|
| 68d695d81d |
| 65788be1b3 |
| 9a08e27a19 |
| b558856e1e |
| dcbcb2c377 |
| ff43432ef9 |
| afa12ba031 |
| bf4d66c874 |
| 9347a4ebe5 |
| 223a50f9e0 |
| 2a6068f9e4 |
| 91a9b0febc |
| ed637c972b |
| fffc5a9956 |
.claude/settings.local.json (new file, 15 lines)
@@ -0,0 +1,15 @@
{
  "permissions": {
    "allow": [
      "Bash(conda env list:*)",
      "Bash(mamba env:*)",
      "Bash(micromamba env list:*)",
      "Bash(echo:*)",
      "Bash(git show:*)",
      "Bash(nvidia-smi:*)",
      "Bash(conda activate unifolm-wma)",
      "Bash(conda info:*)",
      "Bash(direnv allow:*)"
    ]
  }
}
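The `Bash(prefix:*)` entries above follow Claude Code's permission-pattern syntax; as far as I can tell from the settings format, each pre-approves shell commands that begin with the named prefix (so `Bash(git show:*)` covers any `git show ...` invocation), while the one entry without `:*`, `Bash(conda activate unifolm-wma)`, allows only that exact command.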
.envrc (new file, 2 lines)
@@ -0,0 +1,2 @@
eval "$(conda shell.bash hook 2>/dev/null)"
conda activate unifolm-wma
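With direnv installed, entering the repository directory executes this `.envrc`: the first line hooks conda into the non-interactive shell, and the second activates the `unifolm-wma` environment. direnv refuses to run an `.envrc` until it has been approved once with `direnv allow`, which is why that command appears in the permission list above.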
.gitignore (vendored, 10 changed lines)
@@ -55,7 +55,6 @@ coverage.xml
 *.pot
 
 # Django stuff:
-*.log
 local_settings.py
 db.sqlite3
 
@@ -121,10 +120,17 @@ localTest/
 fig/
 figure/
 *.mp4
-*.json
 Data/ControlVAE.yml
 Data/Misc
 Data/Pretrained
 Data/utils.py
 Experiment/checkpoint
 Experiment/log
+
+*.ckpt
+
+*.0
+ckpts/unifolm_wma_dual.ckpt.prepared.pt
+trt_engines/video_backbone.engine
+trt_engines/video_backbone.onnx
ckpts/LICENSE (new file, 439 lines)
@@ -0,0 +1,439 @@
Attribution-NonCommercial-ShareAlike 4.0 International

Copyright (c) 2016-2025 HangZhou YuShu TECHNOLOGY CO.,LTD. ("Unitree Robotics")

=======================================================================

Creative Commons Corporation ("Creative Commons") is not a law firm and
does not provide legal services or legal advice. Distribution of
Creative Commons public licenses does not create a lawyer-client or
other relationship. Creative Commons makes its licenses and related
information available on an "as-is" basis. Creative Commons gives no
warranties regarding its licenses, any material licensed under their
terms and conditions, or any related information. Creative Commons
disclaims all liability for damages resulting from their use to the
fullest extent possible.

Using Creative Commons Public Licenses

Creative Commons public licenses provide a standard set of terms and
conditions that creators and other rights holders may use to share
original works of authorship and other material subject to copyright
and certain other rights specified in the public license below. The
following considerations are for informational purposes only, are not
exhaustive, and do not form part of our licenses.

     Considerations for licensors: Our public licenses are
     intended for use by those authorized to give the public
     permission to use material in ways otherwise restricted by
     copyright and certain other rights. Our licenses are
     irrevocable. Licensors should read and understand the terms
     and conditions of the license they choose before applying it.
     Licensors should also secure all rights necessary before
     applying our licenses so that the public can reuse the
     material as expected. Licensors should clearly mark any
     material not subject to the license. This includes other CC-
     licensed material, or material used under an exception or
     limitation to copyright. More considerations for licensors:
     wiki.creativecommons.org/Considerations_for_licensors

     Considerations for the public: By using one of our public
     licenses, a licensor grants the public permission to use the
     licensed material under specified terms and conditions. If
     the licensor's permission is not necessary for any reason--for
     example, because of any applicable exception or limitation to
     copyright--then that use is not regulated by the license. Our
     licenses grant only permissions under copyright and certain
     other rights that a licensor has authority to grant. Use of
     the licensed material may still be restricted for other
     reasons, including because others have copyright or other
     rights in the material. A licensor may make special requests,
     such as asking that all changes be marked or described.
     Although not required by our licenses, you are encouraged to
     respect those requests where reasonable. More considerations
     for the public:
     wiki.creativecommons.org/Considerations_for_licensees

=======================================================================

Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
Public License

By exercising the Licensed Rights (defined below), You accept and agree
to be bound by the terms and conditions of this Creative Commons
Attribution-NonCommercial-ShareAlike 4.0 International Public License
("Public License"). To the extent this Public License may be
interpreted as a contract, You are granted the Licensed Rights in
consideration of Your acceptance of these terms and conditions, and the
Licensor grants You such rights in consideration of benefits the
Licensor receives from making the Licensed Material available under
these terms and conditions.


Section 1 -- Definitions.

  a. Adapted Material means material subject to Copyright and Similar
     Rights that is derived from or based upon the Licensed Material
     and in which the Licensed Material is translated, altered,
     arranged, transformed, or otherwise modified in a manner requiring
     permission under the Copyright and Similar Rights held by the
     Licensor. For purposes of this Public License, where the Licensed
     Material is a musical work, performance, or sound recording,
     Adapted Material is always produced where the Licensed Material is
     synched in timed relation with a moving image.

  b. Adapter's License means the license You apply to Your Copyright
     and Similar Rights in Your contributions to Adapted Material in
     accordance with the terms and conditions of this Public License.

  c. BY-NC-SA Compatible License means a license listed at
     creativecommons.org/compatiblelicenses, approved by Creative
     Commons as essentially the equivalent of this Public License.

  d. Copyright and Similar Rights means copyright and/or similar rights
     closely related to copyright including, without limitation,
     performance, broadcast, sound recording, and Sui Generis Database
     Rights, without regard to how the rights are labeled or
     categorized. For purposes of this Public License, the rights
     specified in Section 2(b)(1)-(2) are not Copyright and Similar
     Rights.

  e. Effective Technological Measures means those measures that, in the
     absence of proper authority, may not be circumvented under laws
     fulfilling obligations under Article 11 of the WIPO Copyright
     Treaty adopted on December 20, 1996, and/or similar international
     agreements.

  f. Exceptions and Limitations means fair use, fair dealing, and/or
     any other exception or limitation to Copyright and Similar Rights
     that applies to Your use of the Licensed Material.

  g. License Elements means the license attributes listed in the name
     of a Creative Commons Public License. The License Elements of this
     Public License are Attribution, NonCommercial, and ShareAlike.

  h. Licensed Material means the artistic or literary work, database,
     or other material to which the Licensor applied this Public
     License.

  i. Licensed Rights means the rights granted to You subject to the
     terms and conditions of this Public License, which are limited to
     all Copyright and Similar Rights that apply to Your use of the
     Licensed Material and that the Licensor has authority to license.

  j. Licensor means the individual(s) or entity(ies) granting rights
     under this Public License.

  k. NonCommercial means not primarily intended for or directed towards
     commercial advantage or monetary compensation. For purposes of
     this Public License, the exchange of the Licensed Material for
     other material subject to Copyright and Similar Rights by digital
     file-sharing or similar means is NonCommercial provided there is
     no payment of monetary compensation in connection with the
     exchange.

  l. Share means to provide material to the public by any means or
     process that requires permission under the Licensed Rights, such
     as reproduction, public display, public performance, distribution,
     dissemination, communication, or importation, and to make material
     available to the public including in ways that members of the
     public may access the material from a place and at a time
     individually chosen by them.

  m. Sui Generis Database Rights means rights other than copyright
     resulting from Directive 96/9/EC of the European Parliament and of
     the Council of 11 March 1996 on the legal protection of databases,
     as amended and/or succeeded, as well as other essentially
     equivalent rights anywhere in the world.

  n. You means the individual or entity exercising the Licensed Rights
     under this Public License. Your has a corresponding meaning.


Section 2 -- Scope.

  a. License grant.

       1. Subject to the terms and conditions of this Public License,
          the Licensor hereby grants You a worldwide, royalty-free,
          non-sublicensable, non-exclusive, irrevocable license to
          exercise the Licensed Rights in the Licensed Material to:

            a. reproduce and Share the Licensed Material, in whole or
               in part, for NonCommercial purposes only; and

            b. produce, reproduce, and Share Adapted Material for
               NonCommercial purposes only.

       2. Exceptions and Limitations. For the avoidance of doubt, where
          Exceptions and Limitations apply to Your use, this Public
          License does not apply, and You do not need to comply with
          its terms and conditions.

       3. Term. The term of this Public License is specified in Section
          6(a).

       4. Media and formats; technical modifications allowed. The
          Licensor authorizes You to exercise the Licensed Rights in
          all media and formats whether now known or hereafter created,
          and to make technical modifications necessary to do so. The
          Licensor waives and/or agrees not to assert any right or
          authority to forbid You from making technical modifications
          necessary to exercise the Licensed Rights, including
          technical modifications necessary to circumvent Effective
          Technological Measures. For purposes of this Public License,
          simply making modifications authorized by this Section 2(a)
          (4) never produces Adapted Material.

       5. Downstream recipients.

            a. Offer from the Licensor -- Licensed Material. Every
               recipient of the Licensed Material automatically
               receives an offer from the Licensor to exercise the
               Licensed Rights under the terms and conditions of this
               Public License.

            b. Additional offer from the Licensor -- Adapted Material.
               Every recipient of Adapted Material from You
               automatically receives an offer from the Licensor to
               exercise the Licensed Rights in the Adapted Material
               under the conditions of the Adapter's License You apply.

            c. No downstream restrictions. You may not offer or impose
               any additional or different terms or conditions on, or
               apply any Effective Technological Measures to, the
               Licensed Material if doing so restricts exercise of the
               Licensed Rights by any recipient of the Licensed
               Material.

       6. No endorsement. Nothing in this Public License constitutes or
          may be construed as permission to assert or imply that You
          are, or that Your use of the Licensed Material is, connected
          with, or sponsored, endorsed, or granted official status by,
          the Licensor or others designated to receive attribution as
          provided in Section 3(a)(1)(A)(i).

  b. Other rights.

       1. Moral rights, such as the right of integrity, are not
          licensed under this Public License, nor are publicity,
          privacy, and/or other similar personality rights; however, to
          the extent possible, the Licensor waives and/or agrees not to
          assert any such rights held by the Licensor to the limited
          extent necessary to allow You to exercise the Licensed
          Rights, but not otherwise.

       2. Patent and trademark rights are not licensed under this
          Public License.

       3. To the extent possible, the Licensor waives any right to
          collect royalties from You for the exercise of the Licensed
          Rights, whether directly or through a collecting society
          under any voluntary or waivable statutory or compulsory
          licensing scheme. In all other cases the Licensor expressly
          reserves any right to collect such royalties, including when
          the Licensed Material is used other than for NonCommercial
          purposes.


Section 3 -- License Conditions.

Your exercise of the Licensed Rights is expressly made subject to the
following conditions.

  a. Attribution.

       1. If You Share the Licensed Material (including in modified
          form), You must:

            a. retain the following if it is supplied by the Licensor
               with the Licensed Material:

                 i. identification of the creator(s) of the Licensed
                    Material and any others designated to receive
                    attribution, in any reasonable manner requested by
                    the Licensor (including by pseudonym if
                    designated);

                ii. a copyright notice;

               iii. a notice that refers to this Public License;

                iv. a notice that refers to the disclaimer of
                    warranties;

                 v. a URI or hyperlink to the Licensed Material to the
                    extent reasonably practicable;

            b. indicate if You modified the Licensed Material and
               retain an indication of any previous modifications; and

            c. indicate the Licensed Material is licensed under this
               Public License, and include the text of, or the URI or
               hyperlink to, this Public License.

       2. You may satisfy the conditions in Section 3(a)(1) in any
          reasonable manner based on the medium, means, and context in
          which You Share the Licensed Material. For example, it may be
          reasonable to satisfy the conditions by providing a URI or
          hyperlink to a resource that includes the required
          information.

       3. If requested by the Licensor, You must remove any of the
          information required by Section 3(a)(1)(A) to the extent
          reasonably practicable.

  b. ShareAlike.

     In addition to the conditions in Section 3(a), if You Share
     Adapted Material You produce, the following conditions also apply.

       1. The Adapter's License You apply must be a Creative Commons
          license with the same License Elements, this version or
          later, or a BY-NC-SA Compatible License.

       2. You must include the text of, or the URI or hyperlink to, the
          Adapter's License You apply. You may satisfy this condition
          in any reasonable manner based on the medium, means, and
          context in which You Share Adapted Material.

       3. You may not offer or impose any additional or different terms
          or conditions on, or apply any Effective Technological
          Measures to, Adapted Material that restrict exercise of the
          rights granted under the Adapter's License You apply.


Section 4 -- Sui Generis Database Rights.

Where the Licensed Rights include Sui Generis Database Rights that
apply to Your use of the Licensed Material:

  a. for the avoidance of doubt, Section 2(a)(1) grants You the right
     to extract, reuse, reproduce, and Share all or a substantial
     portion of the contents of the database for NonCommercial purposes
     only;

  b. if You include all or a substantial portion of the database
     contents in a database in which You have Sui Generis Database
     Rights, then the database in which You have Sui Generis Database
     Rights (but not its individual contents) is Adapted Material,
     including for purposes of Section 3(b); and

  c. You must comply with the conditions in Section 3(a) if You Share
     all or a substantial portion of the contents of the database.

For the avoidance of doubt, this Section 4 supplements and does not
replace Your obligations under this Public License where the Licensed
Rights include other Copyright and Similar Rights.


Section 5 -- Disclaimer of Warranties and Limitation of Liability.

  a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
     EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
     AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
     ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
     IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
     WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
     PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
     ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
     KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
     ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.

  b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
     TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
     NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
     INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
     COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
     USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
     ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
     DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
     IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.

  c. The disclaimer of warranties and limitation of liability provided
     above shall be interpreted in a manner that, to the extent
     possible, most closely approximates an absolute disclaimer and
     waiver of all liability.


Section 6 -- Term and Termination.

  a. This Public License applies for the term of the Copyright and
     Similar Rights licensed here. However, if You fail to comply with
     this Public License, then Your rights under this Public License
     terminate automatically.

  b. Where Your right to use the Licensed Material has terminated under
     Section 6(a), it reinstates:

       1. automatically as of the date the violation is cured, provided
          it is cured within 30 days of Your discovery of the
          violation; or

       2. upon express reinstatement by the Licensor.

     For the avoidance of doubt, this Section 6(b) does not affect any
     right the Licensor may have to seek remedies for Your violations
     of this Public License.

  c. For the avoidance of doubt, the Licensor may also offer the
     Licensed Material under separate terms or conditions or stop
     distributing the Licensed Material at any time; however, doing so
     will not terminate this Public License.

  d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
     License.


Section 7 -- Other Terms and Conditions.

  a. The Licensor shall not be bound by any additional or different
     terms or conditions communicated by You unless expressly agreed.

  b. Any arrangements, understandings, or agreements regarding the
     Licensed Material not stated herein are separate from and
     independent of the terms and conditions of this Public License.


Section 8 -- Interpretation.

  a. For the avoidance of doubt, this Public License does not, and
     shall not be interpreted to, reduce, limit, restrict, or impose
     conditions on any use of the Licensed Material that could lawfully
     be made without permission under this Public License.

  b. To the extent possible, if any provision of this Public License is
     deemed unenforceable, it shall be automatically reformed to the
     minimum extent necessary to make it enforceable. If the provision
     cannot be reformed, it shall be severed from this Public License
     without affecting the enforceability of the remaining terms and
     conditions.

  c. No term or condition of this Public License will be waived and no
     failure to comply consented to unless expressly agreed to by the
     Licensor.

  d. Nothing in this Public License constitutes or may be interpreted
     as a limitation upon, or waiver of, any privileges and immunities
     that apply to the Licensor or You, including from the legal
     processes of any jurisdiction or authority.

=======================================================================

Creative Commons is not a party to its public
licenses. Notwithstanding, Creative Commons may elect to apply one of
its public licenses to material it publishes and in those instances
will be considered the "Licensor." The text of the Creative Commons
public licenses is dedicated to the public domain under the CC0 Public
Domain Dedication. Except for the limited purpose of indicating that
material is shared under a Creative Commons public license or as
otherwise permitted by the Creative Commons policies published at
creativecommons.org/policies, Creative Commons does not authorize the
use of the trademark "Creative Commons" or any other trademark or logo
of Creative Commons without its prior written consent including,
without limitation, in connection with any unauthorized modifications
to any of its public licenses or any other arrangements,
understandings, or agreements concerning use of licensed material. For
the avoidance of doubt, this paragraph does not form part of the
public licenses.

Creative Commons may be contacted at creativecommons.org.
ckpts/README.md (new file, 38 lines)
@@ -0,0 +1,38 @@
---
tags:
- robotics
---

# UnifoLM-WMA-0: A World-Model-Action (WMA) Framework under UnifoLM Family

<p style="font-size: 1.2em;">
  <a href="https://unigen-x.github.io/unifolm-world-model-action.github.io"><strong>Project Page</strong></a> |
  <a href="https://github.com/unitreerobotics/unifolm-world-model-action"><strong>Code</strong></a> |
  <a href="https://huggingface.co/unitreerobotics/datasets"><strong>Dataset</strong></a>
</p>

<div align="center">
  <div align="justify">
    <b>UnifoLM-WMA-0</b> is Unitree's first open-source world-model-action architecture spanning multiple types of robotic embodiments, designed specifically for general-purpose robot learning. Its core component is a world model capable of understanding the physical interactions between robots and their environments. This world model provides two key functions: (a) <b>Simulation Engine</b>: operates as an interactive simulator to generate synthetic data for robot learning; (b) <b>Policy Enhancement</b>: connects with an action head and, by predicting future interaction processes with the world model, further optimizes decision-making performance.
  </div>
</div>

## 🦾 Real Robot Deployment

| <img src="assets/real_z1_stackbox.gif" style="border:none;box-shadow:none;margin:0;padding:0;" /> | <img src="assets/real_dual_stackbox.gif" style="border:none;box-shadow:none;margin:0;padding:0;" /> |
|:---:|:---:|
| <img src="assets/real_cleanup_pencils.gif" style="border:none;box-shadow:none;margin:0;padding:0;" /> | <img src="assets/real_g1_pack_camera.gif" style="border:none;box-shadow:none;margin:0;padding:0;" /> |

**Note: the top-right window shows the world model's prediction of future environmental changes.**

## License

The model is released under the CC BY-NC-SA 4.0 license as found in the [LICENSE](https://huggingface.co/unitreerobotics/UnifoLM-WMA-0/blob/main/LICENSE) file. You are responsible for ensuring that your use of Unitree AI Models complies with all applicable laws.

## Model Architecture

![img]()

## Citation

```
@misc{unifolm-wma-0,
    author = {Unitree},
    title  = {UnifoLM-WMA-0: A World-Model-Action (WMA) Framework under UnifoLM Family},
    year   = {2025},
}
```
ckpts/assets/real_cleanup_pencils.gif (new binary file, 22 MiB)
ckpts/assets/real_dual_stackbox.gif (new binary file, 28 MiB)
ckpts/assets/real_g1_pack_camera.gif (new binary file, 25 MiB)
ckpts/assets/real_z1_stackbox.gif (new binary file, 15 MiB)
ckpts/assets/world_model_interaction.gif (new binary file, 4.3 MiB)
@@ -222,7 +222,7 @@ data:
   test:
     target: unifolm_wma.data.wma_data.WMAData
     params:
-      data_dir: '/path/to/unifolm-world-model-action/examples/world_model_interaction_prompts'
+      data_dir: '/home/qhy/unifolm-world-model-action/examples/world_model_interaction_prompts'
       video_length: ${model.params.wma_config.params.temporal_length}
       frame_stride: 2
       load_raw_resolution: True
psnr_score_for_challenge.py (new file, 89 lines)
@@ -0,0 +1,89 @@
import os
import glob
import numpy as np
import json
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from tqdm import tqdm
from moviepy.video.io.VideoFileClip import VideoFileClip
import PIL.Image


def calculate_psnr(img1, img2):
    mse = np.mean((img1.astype(np.float64) - img2.astype(np.float64)) ** 2)
    if mse == 0:
        return float('inf')
    max_pixel = 255.0
    psnr = 20 * np.log10(max_pixel / np.sqrt(mse))
    return psnr


def process_video_psnr(gt_path, pred_path):
    try:
        clip_gt = VideoFileClip(gt_path)
        clip_pred = VideoFileClip(pred_path)

        fps = min(clip_gt.fps, clip_pred.fps)
        duration = min(clip_gt.duration, clip_pred.duration)

        time_points = np.arange(0, duration, 1.0 / fps)

        video_psnrs = []

        for t in time_points:
            frame_gt = clip_gt.get_frame(t)
            frame_pred = clip_pred.get_frame(t)

            img_gt = PIL.Image.fromarray(frame_gt).resize((256, 256), PIL.Image.Resampling.BILINEAR)
            img_pred = PIL.Image.fromarray(frame_pred).resize((256, 256), PIL.Image.Resampling.BILINEAR)

            psnr = calculate_psnr(np.array(img_gt), np.array(img_pred))
            video_psnrs.append(psnr)

        clip_gt.close()
        clip_pred.close()

        return np.mean(video_psnrs) if video_psnrs else 0.0

    except Exception as e:
        print(f"Error processing {os.path.basename(gt_path)}: {e}")
        return None


def main():
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('--gt_video', type=str, required=True, help='path to reference videos')
    parser.add_argument('--pred_video', type=str, required=True, help='path to pred videos')
    parser.add_argument('--output_file', type=str, default=None, help='path to output file')
    args = parser.parse_args()

    if not os.path.exists(args.gt_video):
        print(f"Error: GT video not found at {args.gt_video}")
        return
    if not os.path.exists(args.pred_video):
        print(f"Error: Pred video not found at {args.pred_video}")
        return

    print(f"Comparing:\nRef: {args.gt_video}\nPred: {args.pred_video}")

    v_psnr = process_video_psnr(args.gt_video, args.pred_video)

    if v_psnr is not None:
        print("-" * 30)
        print(f"Video PSNR: {v_psnr:.4f} dB")
        print("-" * 30)

        if args.output_file:
            result = {
                "gt_video": args.gt_video,
                "pred_video": args.pred_video,
                "psnr": v_psnr
            }
            with open(args.output_file, 'w') as f:
                json.dump(result, f, indent=4)
            print(f"Result saved to {args.output_file}")
    else:
        print("Failed to calculate PSNR.")


if __name__ == '__main__':
    main()
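For intuition, a standalone sanity check of the math in calculate_psnr (illustrative arrays, not part of the committed script): two images that differ by a uniform offset of 10 gray levels have MSE = 100, so PSNR = 20 * log10(255 / 10) ≈ 28.13 dB.

    import numpy as np

    img_a = np.zeros((256, 256, 3), dtype=np.uint8)
    img_b = np.full((256, 256, 3), 10, dtype=np.uint8)  # uniform offset of 10

    mse = np.mean((img_a.astype(np.float64) - img_b.astype(np.float64)) ** 2)
    print(20 * np.log10(255.0 / np.sqrt(mse)))  # -> 28.1308...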
@@ -19,13 +19,13 @@ dependencies = [
     "pytorch-lightning==1.9.3",
     "pyyaml==6.0",
     "setuptools==65.6.3",
-    "torch==2.3.1",
-    "torchvision==0.18.1",
+    #"torch==2.3.1",
+    #"torchvision==0.18.1",
     "tqdm==4.66.5",
     "transformers==4.40.1",
     "moviepy==1.0.3",
     "av==12.3.0",
-    "xformers==0.0.27",
+    #"xformers==0.0.27",
     "gradio==4.39.0",
     "timm==0.9.10",
     "scikit-learn==1.5.1",
record.md (new file, 208 lines)
@@ -0,0 +1,208 @@
OK, here is the complete walkthrough of the decode optimizations.

---

Background: problems with the original decode path

In the original code, each iteration's world-model interaction calls image_guided_synthesis_sim_mode, which internally runs a full VAE decode on the latents produced by DDIM sampling:

```python
# ddpms.py decode_core, perframe_ae=True
for index in range(z.shape[0]):  # per-frame loop: 16 frames means 16 calls
    frame_z = 1. / self.scale_factor * z[index:index+1]
    frame_result = self.first_stage_model.decode(frame_z)
    results.append(frame_result)
```

Assuming 8 iterations of 16 frames each, the WM path decodes 128 times and the policy path decodes another 128 times, for 256 VAE decodes in total, each of them synchronous and blocking.

The decoded video is also written out every iteration as an mp4 plus TensorBoard entries, producing heavy disk I/O. At the end, the decoded videos of all iterations are concatenated in memory with torch.cat and the complete video is written once more.

---

Optimization 1: a decode_video switch, skipping decode on demand

File: world_model_interaction.py, function image_guided_synthesis_sim_mode

Change: add a decode_video parameter (default False) and additionally return the raw samples:

```python
def image_guided_synthesis_sim_mode(...,
        decode_video: bool = False,  # new
        ...) -> tuple[Tensor | None, Tensor, Tensor, Tensor | None]:

    samples = None
    if ddim_sampler is not None:
        samples, actions, states, intermedia = ddim_sampler.sample(...)
    if decode_video:  # conditional decode
        batch_images = model.decode_first_stage(samples)
        batch_variants = batch_images

    return batch_variants, actions, states, samples  # samples returned as well
```

Call sites:
- Policy path: controlled by the CLI flag --fast_policy_no_decode; decoding can be skipped when only the actions are needed
- WM interaction path: passes decode_video=False and takes only the raw latents

Effect: the WM path saves a full 16-frame decode per iteration.

---

Optimization 2: decode only the frames the observation needs

Problem: the WM path now skips the full decode, but the next iteration's CLIP embedding needs pixel-space images as its observation.

Change: decode only exe_steps frames (typically 1) instead of all 16:

```python
# WM call, no full decode
pred_videos_1, _, pred_states, wm_samples = image_guided_synthesis_sim_mode(
    ..., decode_video=False)

# decode only exe_steps frames for the observation
obs_pixels = model.decode_first_stage(
    wm_samples[:, :, :args.exe_steps, :, :])

for idx in range(args.exe_steps):
    observation = {
        'observation.images.top': obs_pixels[0, :, idx:idx + 1].permute(1, 0, 2, 3),
        ...
    }
    cond_obs_queues = populate_queues(cond_obs_queues, observation)
```

Key detail: the observation queue must be filled frame by frame (idx:idx+1); reusing the last frame for every slot would change the CLIP embedding input and hurt accuracy.

Effect: per iteration, decoding drops from 16 frames to exe_steps frames (saving 15 frames per iteration).

---

Optimization 3: a decode stream, overlapping decode with the UNet on the GPU

Problem: writing the final video still needs the pixels of the complete segment, so this part of the decode cannot be skipped.

Idea: run the segment decode on a dedicated CUDA stream so it overlaps with the next iteration's UNet inference on the GPU.

Changes:

Initialization:
```python
decode_stream = torch.cuda.Stream(device=device)
pending_decode = None
```

At the tail of each loop iteration:
```python
# collect the previous iteration's decode result
if pending_decode is not None:
    decode_stream.synchronize()
    write_q.put(pending_decode.cpu())
    pending_decode = None

# launch this iteration's segment decode on the decode stream (without blocking the main thread)
latent_slice = wm_samples[:, :, :args.exe_steps]
decode_stream.wait_stream(torch.cuda.current_stream())  # make sure the latents are ready
with torch.cuda.stream(decode_stream):
    pending_decode = model.decode_first_stage(latent_slice)
# the main thread moves straight on to the next UNet iteration
```

After the loop, collect the last iteration:
```python
if pending_decode is not None:
    decode_stream.synchronize()
    write_q.put(pending_decode.cpu())
```

How it works: decode_stream.wait_stream() establishes an inter-stream dependency, guaranteeing that decoding starts only after the latents have been produced. Kernels from the two streams can then be interleaved by the GPU scheduler.

Effect: the segment decode time is hidden behind the next iteration's UNet inference.
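The same overlap pattern as a minimal, self-contained sketch (my illustration, not code from the repo; assumes a CUDA device):

```python
import torch

device = torch.device('cuda')
main = torch.cuda.current_stream(device)
side = torch.cuda.Stream(device)

x = torch.randn(4096, 4096, device=device)
y = x @ x                      # stands in for the UNet step on the main stream

side.wait_stream(main)         # "decode" must not start before y is ready
with torch.cuda.stream(side):
    y.record_stream(side)      # tell the caching allocator y is used on this stream
    z = y.relu()               # stands in for the VAE decode

w = x @ x                      # next-iteration work; can overlap z on the GPU
side.synchronize()             # wait for the side stream before consuming z
print(z.sum().item(), w.sum().item())
```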
---

Optimization 4: a writer process, moving CPU work off-process

Problem: the decoded tensors still have to be converted to numpy and encoded to disk with cv2, which is CPU-bound work, and the Python GIL limits thread-level parallelism.

Changes:

Helper function (callable from both the main process and the child process):
```python
def _video_tensor_to_frames(video: Tensor) -> np.ndarray:
    video = torch.clamp(video.float(), -1., 1.)
    n = video.shape[0]
    video = video.permute(2, 0, 1, 3, 4)
    frame_grids = [
        torchvision.utils.make_grid(f, nrow=int(n), padding=0) for f in video
    ]
    grid = torch.stack(frame_grids, dim=0)
    grid = ((grid + 1.0) / 2.0 * 255).to(torch.uint8).permute(0, 2, 3, 1)
    return grid.numpy()[:, :, :, ::-1]  # RGB -> BGR
```

Writer process:
```python
def _video_writer_process(q: mp.Queue, filename: str, fps: int):
    vwriter = None
    while True:
        item = q.get()
        if item is None:  # sentinel: exit
            break
        frames = _video_tensor_to_frames(item)
        if vwriter is None:
            h, w = frames.shape[1], frames.shape[2]
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            vwriter = cv2.VideoWriter(filename, fourcc, fps, (w, h))
        for f in frames:
            vwriter.write(f)
    if vwriter is not None:
        vwriter.release()
```

The main process starts the writer:
```python
write_q = mp.Queue()
writer_proc = mp.Process(target=_video_writer_process,
                         args=(write_q, sample_full_video_file, args.save_fps))
writer_proc.start()
```

The main process sends data with write_q.put(tensor.cpu()); when the loop finishes it sends a None sentinel and join()s the writer (see the sketch after the list below).

Effects:
- the tensor-to-numpy conversion and cv2 encoding no longer cost main-process CPU time
- no GIL contention
- cv2.VideoWriter appends incrementally: no in-memory accumulation and no final torch.cat
- the Queue provides built-in backpressure: if the writer falls behind, put simply blocks instead of running out of memory
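The shutdown handshake mentioned above, spelled out (same names as in the snippets):

```python
# at the end of the main loop:
write_q.put(None)    # the None sentinel makes the writer leave its loop
writer_proc.join()   # block until the last frames are flushed and the file is closed
```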
---

Redundant I/O removed at the same time

- the per-iteration intermediate WM TensorBoard logging (log_to_tensorboard_async)
- the per-iteration intermediate WM mp4 (save_results_async)
- the per-iteration intermediate policy mp4
- the final torch.cat plus the one-shot save_results_async write of the complete video

---

Summary: before vs. after

| | Original | Optimized |
|---|---|---|
| WM decoded frames per iteration | 16 | exe_steps (typically 1) |
| Segment decode | synchronous, blocks the main thread | decode stream, parallel on the GPU |
| tensor-to-numpy + disk write | main thread / thread pool (GIL-bound) | separate process, no GIL |
| Video writing | accumulate in memory, cat, write once | incremental cv2 appends |
| Intermediate I/O | 2 mp4 + 2 TensorBoard writes per iteration | none |
| Memory usage | grows linearly with iteration count | constant (no decoded-video accumulation) |

❯ Which of these optimizations are enabled by default?

● Let me check the switch state of each optimization.

● Current default state of each optimization:

| Optimization | On by default? | Controlled by |
|---|---|---|
| WM path skips the full decode | ✅ yes | hard-coded decode_video=False (line 789) |
| Decode only exe_steps frames for the observation | ✅ yes | no switch, always active (line 792) |
| Decode-stream GPU parallelism | ✅ yes | no switch, always active (lines 824-827) |
| Writer process with incremental writes | ✅ yes | no switch, always active (lines 672-675) |
| Intermediate WM mp4/TensorBoard saves removed | ✅ yes | code deleted |
| Policy path skips decode | ❌ no | requires --fast_policy_no_decode (default False, i.e. decode by default) |
run_all_case.sh (new file, 114 lines)
@@ -0,0 +1,114 @@
#!/bin/bash

# Run every case of every scenario automatically.
# 5 scenarios x 4 cases each = 20 cases in total.

# Environment variables (offline mode)
export HF_HUB_OFFLINE=1
export TRANSFORMERS_OFFLINE=1

# Color definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# All scenarios
SCENARIOS=(
    "unitree_g1_pack_camera"
    "unitree_z1_dual_arm_cleanup_pencils"
    "unitree_z1_dual_arm_stackbox"
    "unitree_z1_dual_arm_stackbox_v2"
    "unitree_z1_stackbox"
)

# Case numbers
CASES=(1 2 3 4)

# Record the start time
START_TIME=$(date +%s)
LOG_FILE="run_all_cases_$(date +%Y%m%d_%H%M%S).log"

echo -e "${BLUE}========================================${NC}"
echo -e "${BLUE}Running all cases of all scenarios${NC}"
echo -e "${BLUE}Total: ${#SCENARIOS[@]} scenarios x ${#CASES[@]} cases = $((${#SCENARIOS[@]} * ${#CASES[@]})) tasks${NC}"
echo -e "${BLUE}Log file: ${LOG_FILE}${NC}"
echo -e "${BLUE}========================================${NC}"
echo ""

# Initialize counters
TOTAL_CASES=$((${#SCENARIOS[@]} * ${#CASES[@]}))
CURRENT_CASE=0
SUCCESS_COUNT=0
FAIL_COUNT=0

# Track failed cases
declare -a FAILED_CASES

# Iterate over all scenarios
for scenario in "${SCENARIOS[@]}"; do
    echo -e "${YELLOW}>>> Scenario: ${scenario}${NC}"

    # Iterate over all cases
    for case_num in "${CASES[@]}"; do
        CURRENT_CASE=$((CURRENT_CASE + 1))
        case_dir="${scenario}/case${case_num}"
        script_path="${case_dir}/run_world_model_interaction.sh"

        echo -e "${BLUE}[${CURRENT_CASE}/${TOTAL_CASES}] Running: ${case_dir}${NC}"

        # Check that the script exists
        if [ ! -f "${script_path}" ]; then
            echo -e "${RED}Error: script not found: ${script_path}${NC}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            FAILED_CASES+=("${case_dir} (script not found)")
            continue
        fi

        # Run the script
        echo "Start time: $(date '+%Y-%m-%d %H:%M:%S')"

        if bash "${script_path}" >> "${LOG_FILE}" 2>&1; then
            echo -e "${GREEN}✓ Success: ${case_dir}${NC}"
            SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
        else
            echo -e "${RED}✗ Failed: ${case_dir}${NC}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            FAILED_CASES+=("${case_dir}")
        fi

        echo "End time: $(date '+%Y-%m-%d %H:%M:%S')"
        echo ""
    done

    echo ""
done

# Compute the total elapsed time
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
HOURS=$((DURATION / 3600))
MINUTES=$(((DURATION % 3600) / 60))
SECONDS=$((DURATION % 60))

# Print the summary
echo -e "${BLUE}========================================${NC}"
echo -e "${BLUE}All done!${NC}"
echo -e "${BLUE}========================================${NC}"
echo -e "Total tasks: ${TOTAL_CASES}"
echo -e "${GREEN}Succeeded: ${SUCCESS_COUNT}${NC}"
echo -e "${RED}Failed: ${FAIL_COUNT}${NC}"
echo -e "Elapsed: ${HOURS}h ${MINUTES}m ${SECONDS}s"
echo -e "Detailed log: ${LOG_FILE}"
echo ""

# List failed cases, if any
if [ ${FAIL_COUNT} -gt 0 ]; then
    echo -e "${RED}Failed cases:${NC}"
    for failed_case in "${FAILED_CASES[@]}"; do
        echo -e "${RED} - ${failed_case}${NC}"
    done
    echo ""
fi

echo -e "${BLUE}========================================${NC}"
run_all_cases_20260218_190150.log (new file, 504 lines)
@@ -0,0 +1,504 @@
2026-02-18 19:01:56.891895: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-18 19:01:56.940243: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-02-18 19:01:56.940285: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-02-18 19:01:56.941395: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-02-18 19:01:56.948327: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-02-18 19:01:57.870809: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
Global seed set to 123
>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
>>> Prepared model loaded.
INFO:root:***** Configing Data *****
>>> unitree_z1_stackbox: 1 data samples loaded.
>>> unitree_z1_stackbox: data stats loaded.
>>> unitree_z1_stackbox: normalizer initiated.
>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
>>> unitree_z1_dual_arm_stackbox: data stats loaded.
>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
>>> unitree_g1_pack_camera: 1 data samples loaded.
>>> unitree_g1_pack_camera: data stats loaded.
>>> unitree_g1_pack_camera: normalizer initiated.
>>> Dataset is successfully loaded ...
✓ KV fused: 66 attention layers
TRT output 'y': [1, 4, 16, 40, 64] torch.float32
TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
>>> Generate 16 frames under each generation ...
DEBUG:h5py._conv:Creating converter from 3 to 5
DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096

  0%|          | 0/11 [00:00<?, ?it/s][02/18/2026-19:02:10] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.

  9%|▉         | 1/11 [00:17<02:51, 17.15s/it]>>> Step 0: generating actions ...
>>> Step 0: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 1: generating actions ...
DEBUG:PIL.Image:Importing BlpImagePlugin
DEBUG:PIL.Image:Importing BlpImagePlugin
DEBUG:PIL.Image:Importing BmpImagePlugin
DEBUG:PIL.Image:Importing BufrStubImagePlugin
DEBUG:PIL.Image:Importing BmpImagePlugin
DEBUG:PIL.Image:Importing BufrStubImagePlugin
DEBUG:PIL.Image:Importing CurImagePlugin
DEBUG:PIL.Image:Importing CurImagePlugin
DEBUG:PIL.Image:Importing DcxImagePlugin
DEBUG:PIL.Image:Importing DcxImagePlugin
DEBUG:PIL.Image:Importing DdsImagePlugin
DEBUG:PIL.Image:Importing DdsImagePlugin
DEBUG:PIL.Image:Importing EpsImagePlugin
DEBUG:PIL.Image:Importing EpsImagePlugin
DEBUG:PIL.Image:Importing FitsImagePlugin
DEBUG:PIL.Image:Importing FitsImagePlugin
DEBUG:PIL.Image:Importing FitsStubImagePlugin
DEBUG:PIL.Image:Importing FitsStubImagePlugin
DEBUG:PIL.Image:Importing FliImagePlugin
DEBUG:PIL.Image:Importing FliImagePlugin
DEBUG:PIL.Image:Importing FpxImagePlugin
DEBUG:PIL.Image:Importing FpxImagePlugin
DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
DEBUG:PIL.Image:Importing FtexImagePlugin
DEBUG:PIL.Image:Importing FtexImagePlugin
DEBUG:PIL.Image:Importing GbrImagePlugin
DEBUG:PIL.Image:Importing GbrImagePlugin
DEBUG:PIL.Image:Importing GifImagePlugin
DEBUG:PIL.Image:Importing GribStubImagePlugin
DEBUG:PIL.Image:Importing GifImagePlugin
DEBUG:PIL.Image:Importing GribStubImagePlugin
DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
DEBUG:PIL.Image:Importing IcnsImagePlugin
DEBUG:PIL.Image:Importing IcnsImagePlugin
DEBUG:PIL.Image:Importing IcoImagePlugin
DEBUG:PIL.Image:Importing IcoImagePlugin
DEBUG:PIL.Image:Importing ImImagePlugin
DEBUG:PIL.Image:Importing ImImagePlugin
DEBUG:PIL.Image:Importing ImtImagePlugin
DEBUG:PIL.Image:Importing ImtImagePlugin
DEBUG:PIL.Image:Importing IptcImagePlugin
DEBUG:PIL.Image:Importing IptcImagePlugin
DEBUG:PIL.Image:Importing JpegImagePlugin
DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
DEBUG:PIL.Image:Importing McIdasImagePlugin
DEBUG:PIL.Image:Importing JpegImagePlugin
DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
DEBUG:PIL.Image:Importing McIdasImagePlugin
DEBUG:PIL.Image:Importing MicImagePlugin
DEBUG:PIL.Image:Importing MicImagePlugin
DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
DEBUG:PIL.Image:Importing MpegImagePlugin
DEBUG:PIL.Image:Importing MpegImagePlugin
DEBUG:PIL.Image:Importing MpoImagePlugin
DEBUG:PIL.Image:Importing MpoImagePlugin
DEBUG:PIL.Image:Importing MspImagePlugin
DEBUG:PIL.Image:Importing MspImagePlugin
DEBUG:PIL.Image:Importing PalmImagePlugin
DEBUG:PIL.Image:Importing PalmImagePlugin
DEBUG:PIL.Image:Importing PcdImagePlugin
DEBUG:PIL.Image:Importing PcdImagePlugin
DEBUG:PIL.Image:Importing PcxImagePlugin
DEBUG:PIL.Image:Importing PdfImagePlugin
DEBUG:PIL.Image:Importing PcxImagePlugin
DEBUG:PIL.Image:Importing PdfImagePlugin
DEBUG:PIL.Image:Importing PixarImagePlugin
DEBUG:PIL.Image:Importing PixarImagePlugin
DEBUG:PIL.Image:Importing PngImagePlugin
DEBUG:PIL.Image:Importing PpmImagePlugin
DEBUG:PIL.Image:Importing PsdImagePlugin
DEBUG:PIL.Image:Importing PngImagePlugin
DEBUG:PIL.Image:Importing PpmImagePlugin
DEBUG:PIL.Image:Importing PsdImagePlugin
DEBUG:PIL.Image:Importing QoiImagePlugin
DEBUG:PIL.Image:Importing QoiImagePlugin
DEBUG:PIL.Image:Importing SgiImagePlugin
DEBUG:PIL.Image:Importing SgiImagePlugin
DEBUG:PIL.Image:Importing SpiderImagePlugin
DEBUG:PIL.Image:Importing SpiderImagePlugin
DEBUG:PIL.Image:Importing SunImagePlugin
DEBUG:PIL.Image:Importing SunImagePlugin
DEBUG:PIL.Image:Importing TgaImagePlugin
DEBUG:PIL.Image:Importing TgaImagePlugin
DEBUG:PIL.Image:Importing TiffImagePlugin
DEBUG:PIL.Image:Importing WebPImagePlugin
DEBUG:PIL.Image:Importing TiffImagePlugin
DEBUG:PIL.Image:Importing WebPImagePlugin
DEBUG:PIL.Image:Importing WmfImagePlugin
DEBUG:PIL.Image:Importing WmfImagePlugin
DEBUG:PIL.Image:Importing XbmImagePlugin
DEBUG:PIL.Image:Importing XbmImagePlugin
DEBUG:PIL.Image:Importing XpmImagePlugin
DEBUG:PIL.Image:Importing XpmImagePlugin
DEBUG:PIL.Image:Importing XVThumbImagePlugin
DEBUG:PIL.Image:Importing XVThumbImagePlugin

 18%|█▊        | 2/11 [00:33<02:31, 16.87s/it]
 27%|██▋       | 3/11 [00:50<02:14, 16.76s/it]
 36%|███▋      | 4/11 [01:07<01:57, 16.81s/it]
 45%|████▌     | 5/11 [01:24<01:41, 16.85s/it]
 55%|█████▍    | 6/11 [01:41<01:24, 16.82s/it]
 64%|██████▎   | 7/11 [01:57<01:07, 16.82s/it]
 73%|███████▎  | 8/11 [02:14<00:50, 16.83s/it]
 82%|████████▏ | 9/11 [02:31<00:33, 16.80s/it]
 91%|█████████ | 10/11 [02:48<00:16, 16.81s/it]
100%|██████████| 11/11 [03:05<00:00, 16.81s/it]
100%|██████████| 11/11 [03:05<00:00, 16.83s/it]
>>> Step 1: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 2: generating actions ...
>>> Step 2: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 3: generating actions ...
>>> Step 3: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 4: generating actions ...
>>> Step 4: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 5: generating actions ...
>>> Step 5: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 6: generating actions ...
>>> Step 6: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 7: generating actions ...
>>> Step 7: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 8: generating actions ...
>>> Step 8: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 9: generating actions ...
>>> Step 9: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 10: generating actions ...
>>> Step 10: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>

real	3m49.072s
user	4m16.055s
sys	0m44.636s
2026-02-18 19:05:45.956647: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-18 19:05:46.004149: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-02-18 19:05:46.004193: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-02-18 19:05:46.005265: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-02-18 19:05:46.012074: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-02-18 19:05:46.932966: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
Global seed set to 123
>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
>>> Prepared model loaded.
INFO:root:***** Configing Data *****
>>> unitree_z1_stackbox: 1 data samples loaded.
>>> unitree_z1_stackbox: data stats loaded.
>>> unitree_z1_stackbox: normalizer initiated.
>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
>>> unitree_z1_dual_arm_stackbox: data stats loaded.
>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
>>> unitree_g1_pack_camera: 1 data samples loaded.
>>> unitree_g1_pack_camera: data stats loaded.
>>> unitree_g1_pack_camera: normalizer initiated.
|
||||||
|
>>> Dataset is successfully loaded ...
|
||||||
|
✓ KV fused: 66 attention layers
|
||||||
|
TRT output 'y': [1, 4, 16, 40, 64] torch.float32
|
||||||
|
TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
|
||||||
|
TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
|
||||||
|
TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
|
||||||
|
TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
|
||||||
|
TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
|
||||||
|
TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
|
||||||
|
TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
|
||||||
|
TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
|
||||||
|
TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
|
||||||
|
>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
|
||||||
|
>>> Generate 16 frames under each generation ...
|
||||||
|
DEBUG:h5py._conv:Creating converter from 3 to 5
|
||||||
|
DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
|
||||||
|
DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
|
||||||
|
DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
|
||||||
|
|
||||||
|
0%| | 0/11 [00:00<?, ?it/s][02/18/2026-19:05:59] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
|
||||||
|
|
||||||
|
9%|▉ | 1/11 [00:16<02:47, 16.71s/it]>>> Step 0: generating actions ...
|
||||||
|
>>> Step 0: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 1: generating actions ...
|
||||||
|
DEBUG:PIL.Image:Importing BlpImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing BlpImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing BmpImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing BufrStubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing BmpImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing BufrStubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing CurImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing CurImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing DcxImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing DcxImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing DdsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing DdsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing EpsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing EpsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FitsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FitsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FitsStubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FitsStubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FliImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FliImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FpxImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FpxImagePlugin
|
||||||
|
DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
|
||||||
|
DEBUG:PIL.Image:Importing FtexImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FtexImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing GbrImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing GbrImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing GifImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing GribStubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing GifImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing GribStubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing IcnsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing IcnsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing IcoImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing IcoImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing ImImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing ImImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing ImtImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing ImtImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing IptcImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing IptcImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing JpegImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing McIdasImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing JpegImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing McIdasImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MicImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MicImagePlugin
|
||||||
|
DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
|
||||||
|
DEBUG:PIL.Image:Importing MpegImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MpegImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MpoImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MpoImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MspImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MspImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PalmImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PalmImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PcdImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PcdImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PcxImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PdfImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PcxImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PdfImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PixarImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PixarImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PngImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PpmImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PsdImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PngImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PpmImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PsdImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing QoiImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing QoiImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing SgiImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing SgiImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing SpiderImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing SpiderImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing SunImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing SunImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing TgaImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing TgaImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing TiffImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing WebPImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing TiffImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing WebPImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing WmfImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing WmfImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing XbmImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing XbmImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing XpmImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing XpmImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing XVThumbImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing XVThumbImagePlugin
|
||||||
|
|
||||||
|
18%|█▊ | 2/11 [00:33<02:30, 16.75s/it]
|
||||||
|
27%|██▋ | 3/11 [00:50<02:15, 16.91s/it]
|
||||||
|
36%|███▋ | 4/11 [01:07<01:59, 17.02s/it]
|
||||||
|
45%|████▌ | 5/11 [01:24<01:41, 16.98s/it]
|
||||||
|
55%|█████▍ | 6/11 [01:41<01:24, 16.94s/it]
|
||||||
|
64%|██████▎ | 7/11 [01:58<01:07, 16.90s/it]
|
||||||
|
73%|███████▎ | 8/11 [02:15<00:50, 16.83s/it]
|
||||||
|
82%|████████▏ | 9/11 [02:31<00:33, 16.80s/it]
|
||||||
|
91%|█████████ | 10/11 [02:49<00:16, 16.94s/it]
|
||||||
|
100%|██████████| 11/11 [03:06<00:00, 16.97s/it]
|
||||||
|
100%|██████████| 11/11 [03:06<00:00, 16.91s/it]
|
||||||
|
>>> Step 1: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 2: generating actions ...
|
||||||
|
>>> Step 2: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 3: generating actions ...
|
||||||
|
>>> Step 3: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 4: generating actions ...
|
||||||
|
>>> Step 4: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 5: generating actions ...
|
||||||
|
>>> Step 5: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 6: generating actions ...
|
||||||
|
>>> Step 6: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 7: generating actions ...
|
||||||
|
>>> Step 7: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 8: generating actions ...
|
||||||
|
>>> Step 8: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 9: generating actions ...
|
||||||
|
>>> Step 9: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 10: generating actions ...
|
||||||
|
>>> Step 10: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
|
||||||
|
real 3m49.162s
|
||||||
|
user 4m12.814s
|
||||||
|
sys 0m45.565s
|
||||||
|
2026-02-18 19:09:35.113634: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
|
||||||
|
2026-02-18 19:09:35.161428: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
|
||||||
|
2026-02-18 19:09:35.161474: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
|
||||||
|
2026-02-18 19:09:35.162551: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
|
||||||
|
2026-02-18 19:09:35.169325: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
|
||||||
|
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
|
||||||
|
2026-02-18 19:09:36.089250: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
|
||||||
|
Global seed set to 123
|
||||||
|
>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
|
||||||
|
>>> Prepared model loaded.
|
||||||
|
INFO:root:***** Configing Data *****
|
||||||
|
>>> unitree_z1_stackbox: 1 data samples loaded.
|
||||||
|
>>> unitree_z1_stackbox: data stats loaded.
|
||||||
|
>>> unitree_z1_stackbox: normalizer initiated.
|
||||||
|
>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
|
||||||
|
>>> unitree_z1_dual_arm_stackbox: data stats loaded.
|
||||||
|
>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
|
||||||
|
>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
|
||||||
|
>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
|
||||||
|
>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
|
||||||
|
>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
|
||||||
|
>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
|
||||||
|
>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
|
||||||
|
>>> unitree_g1_pack_camera: 1 data samples loaded.
|
||||||
|
>>> unitree_g1_pack_camera: data stats loaded.
|
||||||
|
>>> unitree_g1_pack_camera: normalizer initiated.
|
||||||
|
>>> Dataset is successfully loaded ...
|
||||||
|
✓ KV fused: 66 attention layers
|
||||||
|
TRT output 'y': [1, 4, 16, 40, 64] torch.float32
|
||||||
|
TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
|
||||||
|
TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
|
||||||
|
TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
|
||||||
|
TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
|
||||||
|
TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
|
||||||
|
TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
|
||||||
|
TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
|
||||||
|
TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
|
||||||
|
TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
|
||||||
|
>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
|
||||||
|
>>> Generate 16 frames under each generation ...
|
||||||
|
DEBUG:h5py._conv:Creating converter from 3 to 5
|
||||||
|
DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
|
||||||
|
DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
|
||||||
|
DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
|
||||||
|
|
||||||
|
0%| | 0/11 [00:00<?, ?it/s][02/18/2026-19:09:49] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
|
||||||
|
|
||||||
|
9%|▉ | 1/11 [00:16<02:45, 16.53s/it]>>> Step 0: generating actions ...
|
||||||
|
>>> Step 0: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 1: generating actions ...
|
||||||
|
DEBUG:PIL.Image:Importing BlpImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing BlpImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing BmpImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing BufrStubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing BmpImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing BufrStubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing CurImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing CurImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing DcxImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing DcxImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing DdsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing DdsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing EpsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing EpsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FitsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FitsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FitsStubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FitsStubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FliImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FliImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FpxImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FpxImagePlugin
|
||||||
|
DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
|
||||||
|
DEBUG:PIL.Image:Importing FtexImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing FtexImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing GbrImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing GbrImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing GifImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing GribStubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing GifImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing GribStubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing Hdf5StubImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing IcnsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing IcnsImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing IcoImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing IcoImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing ImImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing ImImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing ImtImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing ImtImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing IptcImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing IptcImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing JpegImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing McIdasImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing JpegImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing Jpeg2KImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing McIdasImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MicImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MicImagePlugin
|
||||||
|
DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
|
||||||
|
DEBUG:PIL.Image:Importing MpegImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MpegImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MpoImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MpoImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MspImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing MspImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PalmImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PalmImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PcdImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PcdImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PcxImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PdfImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PcxImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PdfImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PixarImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PixarImagePlugin
|
||||||
|
DEBUG:PIL.Image:Importing PngImagePlugin
|
||||||
61
run_all_psnr.sh
Normal file
@@ -0,0 +1,61 @@
+#!/bin/bash
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+cd "$SCRIPT_DIR"
+
+SCENARIOS=(
+    unitree_g1_pack_camera
+    unitree_z1_dual_arm_cleanup_pencils
+    unitree_z1_dual_arm_stackbox
+    unitree_z1_dual_arm_stackbox_v2
+    unitree_z1_stackbox
+)
+
+CASES=(case1 case2 case3 case4)
+
+total=0
+success=0
+fail=0
+
+for scenario in "${SCENARIOS[@]}"; do
+    for case in "${CASES[@]}"; do
+        case_dir="${scenario}/${case}"
+        gt_video="${case_dir}/${scenario}_${case}.mp4"
+        pred_video=$(ls "${case_dir}"/output/inference/*_full_fs*.mp4 2>/dev/null | head -1)
+        output_file="${case_dir}/psnr_result.json"
+
+        total=$((total + 1))
+        echo "=========================================="
+        echo "[${total}/20] ${case_dir}"
+
+        if [ ! -f "$gt_video" ]; then
+            echo "  SKIP: GT video not found: $gt_video"
+            fail=$((fail + 1))
+            continue
+        fi
+        if [ -z "$pred_video" ]; then
+            echo "  SKIP: pred video not found in ${case_dir}/output/inference/"
+            fail=$((fail + 1))
+            continue
+        fi
+
+        echo "  GT:   $gt_video"
+        echo "  Pred: $pred_video"
+        echo "  Out:  $output_file"
+
+        if python3 psnr_score_for_challenge.py \
+            --gt_video "$gt_video" \
+            --pred_video "$pred_video" \
+            --output_file "$output_file"; then
+            success=$((success + 1))
+            echo "  DONE"
+        else
+            fail=$((fail + 1))
+            echo "  FAILED"
+        fi
+    done
+done
+
+echo "=========================================="
+echo "Finished: ${success} success, ${fail} fail, ${total} total"
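Note: psnr_score_for_challenge.py itself is not part of this diff. For orientation, a minimal sketch of the per-frame PSNR such a scorer presumably computes (hypothetical helper; assumes 8-bit RGB frames read via imageio's pyav plugin, and that the overlapping frame range is compared):

# Hypothetical sketch, NOT the actual psnr_score_for_challenge.py.
import json
import numpy as np
import imageio.v3 as iio

def video_psnr(gt_path: str, pred_path: str) -> float:
    gt = iio.imread(gt_path, plugin="pyav")      # (T, H, W, 3) uint8 frames
    pred = iio.imread(pred_path, plugin="pyav")
    t = min(len(gt), len(pred))                  # compare overlapping frames only
    mse = np.mean((gt[:t].astype(np.float64) - pred[:t].astype(np.float64)) ** 2)
    return float(10 * np.log10(255.0 ** 2 / mse))

if __name__ == "__main__":
    score = video_psnr("gt.mp4", "pred.mp4")
    json.dump({"psnr": score}, open("psnr_result.json", "w"))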
@@ -16,6 +16,9 @@ from collections import OrderedDict
 from unifolm_wma.models.samplers.ddim import DDIMSampler
 from unifolm_wma.utils.utils import instantiate_from_config

+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+

 def get_filelist(data_dir: str, postfixes: list[str]) -> list[str]:
     """
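The same two TF32 switches recur in several entry points below. On Ampere or newer GPUs they let float32 matmuls and convolutions run on tensor cores with a reduced (~10-bit) mantissa, trading a small amount of precision for throughput. A quick way to sanity-check the numerical impact (generic PyTorch sketch, needs a CUDA GPU; not repo code):

import torch

a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")

torch.backends.cuda.matmul.allow_tf32 = False
exact = a @ b                      # full-precision fp32 matmul

torch.backends.cuda.matmul.allow_tf32 = True
fast = a @ b                       # TF32 tensor-core matmul (Ampere+)

print((exact - fast).abs().max())  # small but nonzero difference (~1e-3 relative)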
@@ -19,6 +19,9 @@ from fastapi.responses import JSONResponse
 from typing import Any, Dict, Optional, Tuple, List
 from datetime import datetime

+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+
 from unifolm_wma.utils.utils import instantiate_from_config
 from unifolm_wma.models.samplers.ddim import DDIMSampler
@@ -9,6 +9,8 @@ import logging
 import einops
 import warnings
 import imageio
+import atexit
+from concurrent.futures import ThreadPoolExecutor
 from pytorch_lightning import seed_everything
 from omegaconf import OmegaConf
@@ -16,8 +18,12 @@ from tqdm import tqdm
 from einops import rearrange, repeat
 from collections import OrderedDict
 from torch import nn
-from eval_utils import populate_queues, log_to_tensorboard
+from eval_utils import populate_queues
 from collections import deque
+from typing import Optional, List, Any
+
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
 from torch import Tensor
 from torch.utils.tensorboard import SummaryWriter
 from PIL import Image
@@ -150,6 +156,81 @@ def save_results(video: Tensor, filename: str, fps: int = 8) -> None:
                                options={'crf': '10'})


+# ========== Async I/O ==========
+_io_executor: Optional[ThreadPoolExecutor] = None
+_io_futures: List[Any] = []
+
+
+def _get_io_executor() -> ThreadPoolExecutor:
+    global _io_executor
+    if _io_executor is None:
+        _io_executor = ThreadPoolExecutor(max_workers=2)
+    return _io_executor
+
+
+def _flush_io():
+    """Wait for all pending async I/O to finish."""
+    global _io_futures
+    for fut in _io_futures:
+        try:
+            fut.result()
+        except Exception as e:
+            print(f">>> [async I/O] error: {e}")
+    _io_futures.clear()
+
+
+atexit.register(_flush_io)
+
+
+def _save_results_sync(video_cpu: Tensor, filename: str, fps: int) -> None:
+    """Synchronous save on CPU tensor (runs in background thread)."""
+    video = torch.clamp(video_cpu.float(), -1., 1.)
+    n = video.shape[0]
+    video = video.permute(2, 0, 1, 3, 4)
+    frame_grids = [
+        torchvision.utils.make_grid(framesheet, nrow=int(n), padding=0)
+        for framesheet in video
+    ]
+    grid = torch.stack(frame_grids, dim=0)
+    grid = (grid + 1.0) / 2.0
+    grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1)
+    torchvision.io.write_video(filename,
+                               grid,
+                               fps=fps,
+                               video_codec='h264',
+                               options={'crf': '10'})
+
+
+def save_results_async(video: Tensor, filename: str, fps: int = 8) -> None:
+    """Submit video saving to background thread pool."""
+    video_cpu = video.detach().cpu()
+    fut = _get_io_executor().submit(_save_results_sync, video_cpu, filename, fps)
+    _io_futures.append(fut)
+
+
+def _log_to_tb_sync(writer, video_cpu: Tensor, tag: str, fps: int) -> None:
+    """Synchronous TensorBoard log on CPU tensor (runs in background thread)."""
+    if video_cpu.dim() == 5:
+        n = video_cpu.shape[0]
+        video = video_cpu.permute(2, 0, 1, 3, 4)
+        frame_grids = [
+            torchvision.utils.make_grid(framesheet, nrow=int(n), padding=0)
+            for framesheet in video
+        ]
+        grid = torch.stack(frame_grids, dim=0)
+        grid = (grid + 1.0) / 2.0
+        grid = grid.unsqueeze(dim=0)
+        writer.add_video(tag, grid, fps=fps)
+
+
+def log_to_tensorboard_async(writer, data: Tensor, tag: str, fps: int = 10) -> None:
+    """Submit TensorBoard logging to background thread pool."""
+    if isinstance(data, torch.Tensor) and data.dim() == 5:
+        data_cpu = data.detach().cpu()
+        fut = _get_io_executor().submit(_log_to_tb_sync, writer, data_cpu, tag, fps)
+        _io_futures.append(fut)
+
+
 def get_init_frame_path(data_dir: str, sample: dict) -> str:
     """Construct the init_frame path from directory and sample metadata.
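The additions above are a plain fire-and-forget thread pool: the GPU loop keeps running while h264 encoding happens on CPU threads, and _flush_io() is the only synchronization point. Note that tensors are moved off the GPU with detach().cpu() before submission, so the background threads never touch CUDA state. A minimal standalone illustration of the same pattern (generic sketch, not repo code):

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=2)
futures = []

def slow_save(name):
    # stands in for torchvision.io.write_video
    with open(name, "w") as f:
        f.write("done")

for i in range(5):
    futures.append(pool.submit(slow_save, f"clip_{i}.txt"))  # returns immediately

for fut in futures:   # equivalent of _flush_io(): surface errors, then clear
    fut.result()
futures.clear()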
@@ -327,7 +408,8 @@ def image_guided_synthesis_sim_mode(
         timestep_spacing: str = 'uniform',
         guidance_rescale: float = 0.0,
         sim_mode: bool = True,
-        **kwargs) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        decode_video: bool = True,
+        **kwargs) -> tuple[torch.Tensor | None, torch.Tensor, torch.Tensor]:
     """
     Performs image-guided video generation in a simulation-style mode with optional multimodal guidance (image, state, action, text).

@@ -350,10 +432,13 @@
         timestep_spacing (str): Timestep sampling method in DDIM sampler. Typically "uniform" or "linspace".
         guidance_rescale (float): Guidance rescaling factor to mitigate overexposure from classifier-free guidance.
         sim_mode (bool): Whether to perform world-model interaction or decision-making using the world-model.
+        decode_video (bool): Whether to decode latent samples to pixel-space video.
+            Set to False to skip the VAE decode for speed when only actions/states are needed.
         **kwargs: Additional arguments passed to the DDIM sampler.

     Returns:
-        batch_variants (torch.Tensor): Predicted pixel-space video frames [B, C, T, H, W].
+        batch_variants (torch.Tensor | None): Predicted pixel-space video frames [B, C, T, H, W],
+            or None when decode_video=False.
         actions (torch.Tensor): Predicted action sequences [B, T, D] from diffusion decoding.
         states (torch.Tensor): Predicted state sequences [B, T, D] from diffusion decoding.
     """
@@ -406,6 +491,7 @@
     kwargs.update({"unconditional_conditioning_img_nonetext": None})
     cond_mask = None
     cond_z0 = None
+    batch_variants = None
     if ddim_sampler is not None:
         samples, actions, states, intermedia = ddim_sampler.sample(
             S=ddim_steps,
@@ -424,9 +510,10 @@
             guidance_rescale=guidance_rescale,
             **kwargs)

-    # Reconstruct from latent to pixel space
-    batch_images = model.decode_first_stage(samples)
-    batch_variants = batch_images
+    if decode_video:
+        # Reconstruct from latent to pixel space
+        batch_images = model.decode_first_stage(samples)
+        batch_variants = batch_images

     return batch_variants, actions, states
@@ -453,26 +540,56 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
     csv_path = os.path.join(args.prompt_dir, f"{args.dataset}.csv")
     df = pd.read_csv(csv_path)

-    # Load config
+    # Load config (always needed for data setup)
     config = OmegaConf.load(args.config)
-    config['model']['params']['wma_config']['params'][
-        'use_checkpoint'] = False
-    model = instantiate_from_config(config.model)
-    model.perframe_ae = args.perframe_ae
-    assert os.path.exists(args.ckpt_path), "Error: checkpoint Not Found!"
-    model = load_model_checkpoint(model, args.ckpt_path)
-    model.eval()
-    print(f'>>> Load pre-trained model ...')

-    # Build unnomalizer
+    prepared_path = args.ckpt_path + ".prepared.pt"
+    if os.path.exists(prepared_path):
+        # ---- Fast path: load the fully-prepared model ----
+        print(f">>> Loading prepared model from {prepared_path} ...")
+        model = torch.load(prepared_path,
+                           map_location=f"cuda:{gpu_no}",
+                           weights_only=False,
+                           mmap=True)
+        model.eval()
+        print(f">>> Prepared model loaded.")
+    else:
+        # ---- Normal path: construct + load checkpoint ----
+        config['model']['params']['wma_config']['params'][
+            'use_checkpoint'] = False
+        model = instantiate_from_config(config.model)
+        model.perframe_ae = args.perframe_ae
+
+        assert os.path.exists(args.ckpt_path), "Error: checkpoint Not Found!"
+        model = load_model_checkpoint(model, args.ckpt_path)
+        model.eval()
+        model = model.cuda(gpu_no)
+        print(f'>>> Load pre-trained model ...')
+
+        # Save prepared model for fast loading next time
+        print(f">>> Saving prepared model to {prepared_path} ...")
+        torch.save(model, prepared_path)
+        print(f">>> Prepared model saved ({os.path.getsize(prepared_path) / 1024**3:.1f} GB).")
+
+    # Build normalizer (always needed, independent of model loading path)
     logging.info("***** Configing Data *****")
     data = instantiate_from_config(config.data)
     data.setup()
     print(">>> Dataset is successfully loaded ...")

-    model = model.cuda(gpu_no)
     device = get_device_from_parameters(model)

+    # Fuse KV projections in attention layers (to_k + to_v → to_kv)
+    from unifolm_wma.modules.attention import CrossAttention
+    kv_count = sum(1 for m in model.modules()
+                   if isinstance(m, CrossAttention) and m.fuse_kv())
+    print(f"  ✓ KV fused: {kv_count} attention layers")
+
+    # Load TRT backbone if engine exists
+    trt_engine_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', 'trt_engines', 'video_backbone.engine')
+    if os.path.exists(trt_engine_path):
+        model.model.diffusion_model.load_trt_backbone(trt_engine_path)
+
     # Run over data
     assert (args.height % 16 == 0) and (
         args.width % 16
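The fast path pickles the entire nn.Module once and re-loads it with memory-mapped storage, skipping config instantiation and state-dict key matching on every later run (hence the near-immediate ">>> Prepared model loaded." lines in the logs above). One caveat: a pickled Module is tied to the exact class definitions, so the .prepared.pt cache must be rebuilt after code changes. A minimal sketch of the same round trip, assuming any nn.Module (generic sketch, not repo code):

import os
import torch
from torch import nn

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU())
path = "model.prepared.pt"

if os.path.exists(path):
    # mmap=True keeps weights on disk until touched; weights_only=False
    # is required because a full Module was pickled, not a state_dict.
    model = torch.load(path, map_location="cpu", weights_only=False, mmap=True)
else:
    torch.save(model, path)  # one-time cost; later runs take the branch above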
@@ -587,7 +704,8 @@
                     fs=model_input_fs,
                     timestep_spacing=args.timestep_spacing,
                     guidance_rescale=args.guidance_rescale,
-                    sim_mode=False)
+                    sim_mode=False,
+                    decode_video=not args.fast_policy_no_decode)

                 # Update future actions in the observation queues
                 for idx in range(len(pred_actions[0])):
@@ -644,29 +762,31 @@
            cond_obs_queues = populate_queues(cond_obs_queues,
                                              observation)

-            # Save the imagen videos for decision-making
-            sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}"
-            log_to_tensorboard(writer,
-                               pred_videos_0,
-                               sample_tag,
-                               fps=args.save_fps)
+            # Save the imagen videos for decision-making (async)
+            if pred_videos_0 is not None:
+                sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}"
+                log_to_tensorboard_async(writer,
+                                         pred_videos_0,
+                                         sample_tag,
+                                         fps=args.save_fps)
            # Save videos environment changes via world-model interaction
            sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/itr-{itr}"
-            log_to_tensorboard(writer,
-                               pred_videos_1,
-                               sample_tag,
-                               fps=args.save_fps)
+            log_to_tensorboard_async(writer,
+                                     pred_videos_1,
+                                     sample_tag,
+                                     fps=args.save_fps)

            # Save the imagen videos for decision-making
-            sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4'
-            save_results(pred_videos_0.cpu(),
-                         sample_video_file,
-                         fps=args.save_fps)
+            if pred_videos_0 is not None:
+                sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4'
+                save_results_async(pred_videos_0,
+                                   sample_video_file,
+                                   fps=args.save_fps)
            # Save videos environment changes via world-model interaction
            sample_video_file = f'{video_save_dir}/wm/{fs}/itr-{itr}.mp4'
-            save_results(pred_videos_1.cpu(),
-                         sample_video_file,
-                         fps=args.save_fps)
+            save_results_async(pred_videos_1,
+                               sample_video_file,
+                               fps=args.save_fps)

            print('>' * 24)
            # Collect the result of world-model interactions
@@ -674,12 +794,15 @@

        full_video = torch.cat(wm_video, dim=2)
        sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/full"
-        log_to_tensorboard(writer,
-                           full_video,
-                           sample_tag,
-                           fps=args.save_fps)
+        log_to_tensorboard_async(writer,
+                                 full_video,
+                                 sample_tag,
+                                 fps=args.save_fps)
        sample_full_video_file = f"{video_save_dir}/../{sample['videoid']}_full_fs{fs}.mp4"
-        save_results(full_video, sample_full_video_file, fps=args.save_fps)
+        save_results_async(full_video, sample_full_video_file, fps=args.save_fps)
+
+    # Wait for all async I/O to complete
+    _flush_io()


 def get_parser():
@@ -794,6 +917,11 @@ def get_parser():
                        action='store_true',
                        default=False,
                        help="not using the predicted states as comparison")
+    parser.add_argument(
+        "--fast_policy_no_decode",
+        action='store_true',
+        default=False,
+        help="Speed mode: policy pass only predicts actions, skip policy video decode/log/save.")
    parser.add_argument("--save_fps",
                        type=int,
                        default=8,
87
scripts/export_trt.py
Normal file
@@ -0,0 +1,87 @@
+"""Export video UNet backbone to ONNX, then convert to TensorRT engine.
+
+Usage:
+    python scripts/export_trt.py \
+        --ckpt ckpts/unifolm_wma_dual.ckpt.prepared.pt \
+        --config configs/inference/world_model_interaction.yaml \
+        --out_dir trt_engines
+"""
+
+import os
+import sys
+import argparse
+
+import torch
+import tensorrt as trt
+from omegaconf import OmegaConf
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
+from unifolm_wma.utils.utils import instantiate_from_config
+from unifolm_wma.trt_utils import export_backbone_onnx
+
+
+def load_model(config_path, ckpt_path):
+    if ckpt_path.endswith('.prepared.pt'):
+        model = torch.load(ckpt_path, map_location='cpu')
+    else:
+        config = OmegaConf.load(config_path)
+        model = instantiate_from_config(config.model)
+        state_dict = torch.load(ckpt_path, map_location='cpu')
+        if 'state_dict' in state_dict:
+            state_dict = state_dict['state_dict']
+        model.load_state_dict(state_dict, strict=False)
+    model.eval().cuda()
+    return model
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--ckpt', required=True)
+    parser.add_argument('--config', default='configs/inference/world_model_interaction.yaml')
+    parser.add_argument('--out_dir', default='trt_engines')
+    parser.add_argument('--context_len', type=int, default=95)
+    parser.add_argument('--fp16', action='store_true', default=True)
+    args = parser.parse_args()
+
+    os.makedirs(args.out_dir, exist_ok=True)
+    onnx_path = os.path.join(args.out_dir, 'video_backbone.onnx')
+    engine_path = os.path.join(args.out_dir, 'video_backbone.engine')
+
+    if os.path.exists(onnx_path):
+        print(f">>> ONNX already exists at {onnx_path}, skipping export.")
+        n_outputs = 10
+    else:
+        print(">>> Loading model ...")
+        model = load_model(args.config, args.ckpt)
+        print(">>> Exporting ONNX ...")
+        with torch.no_grad():
+            n_outputs = export_backbone_onnx(model, onnx_path, context_len=args.context_len)
+        del model
+        torch.cuda.empty_cache()
+
+    print(">>> Converting ONNX -> TensorRT engine ...")
+    logger = trt.Logger(trt.Logger.WARNING)
+    builder = trt.Builder(logger)
+    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+    parser = trt.OnnxParser(network, logger)
+
+    if not parser.parse_from_file(os.path.abspath(onnx_path)):
+        for i in range(parser.num_errors):
+            print(f"  ONNX parse error: {parser.get_error(i)}")
+        raise RuntimeError("ONNX parsing failed")
+
+    config = builder.create_builder_config()
+    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 16 << 30)
+    if args.fp16:
+        config.set_flag(trt.BuilderFlag.FP16)
+
+    engine_bytes = builder.build_serialized_network(network, config)
+    with open(engine_path, 'wb') as f:
+        f.write(engine_bytes)
+
+    print(f"\n>>> Done! Engine saved to {engine_path}")
+    print(f"  Outputs: 1 y + {n_outputs - 1} hs_a tensors")
+
+
+if __name__ == '__main__':
+    main()
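Once built, the engine's I/O layout (the 'y' and 'hs_a_*' tensors printed in the logs above) can be inspected without running inference. A small sketch using the TensorRT Python API (assumes TensorRT 8.5+ for the named-tensor API; the engine path matches this repo's layout):

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
with open("trt_engines/video_backbone.engine", "rb") as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())

# List every input/output tensor with its mode, shape and dtype
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    mode = engine.get_tensor_mode(name)   # TensorIOMode.INPUT or .OUTPUT
    print(name, mode, engine.get_tensor_shape(name), engine.get_tensor_dtype(name))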
@@ -11,6 +11,9 @@ from unifolm_wma.utils.utils import instantiate_from_config
 from unifolm_wma.utils.train import get_trainer_callbacks, get_trainer_logger, get_trainer_strategy
 from unifolm_wma.utils.train import set_logger, init_workspace, load_checkpoints, get_num_parameters

+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+

 def get_parser(**parser_kwargs):
     parser = argparse.ArgumentParser(**parser_kwargs)
@@ -501,6 +501,10 @@ class ConditionalUnet1D(nn.Module):
         self.last_frame_only = last_frame_only
         self.horizon = horizon

+        # Context precomputation cache
+        self._global_cond_cache_enabled = False
+        self._global_cond_cache = {}
+
     def forward(self,
                 sample: torch.Tensor,
                 timestep: Union[torch.Tensor, float, int],
@@ -530,14 +534,20 @@
         B, T, D = sample.shape
         if self.use_linear_act_proj:
             sample = self.proj_in_action(sample.unsqueeze(-1))
-            global_cond = self.obs_encoder(cond)
-            global_cond = rearrange(global_cond,
-                                    '(b t) d -> b 1 (t d)',
-                                    b=B,
-                                    t=self.n_obs_steps)
-            global_cond = repeat(global_cond,
-                                 'b c d -> b (repeat c) d',
-                                 repeat=T)
+            _gc_key = (cond['image'].data_ptr(), cond['agent_pos'].data_ptr())
+            if self._global_cond_cache_enabled and _gc_key in self._global_cond_cache:
+                global_cond = self._global_cond_cache[_gc_key]
+            else:
+                global_cond = self.obs_encoder(cond)
+                global_cond = rearrange(global_cond,
+                                        '(b t) d -> b 1 (t d)',
+                                        b=B,
+                                        t=self.n_obs_steps)
+                global_cond = repeat(global_cond,
+                                     'b c d -> b (repeat c) d',
+                                     repeat=T)
+                if self._global_cond_cache_enabled:
+                    self._global_cond_cache[_gc_key] = global_cond
         else:
             sample = einops.rearrange(sample, 'b h t -> b t h')
             sample = self.proj_in_horizon(sample)
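Keying the cache on data_ptr() works here because the conditioning tensors are created once per sampling call and reused unchanged across all DDIM steps. The cache must be cleared when disabled (see the sampler's try/finally below), because a freed tensor's address can be recycled by the CUDA caching allocator. A toy illustration of why the key is only safe within one sampling call (generic sketch, needs a CUDA GPU):

import torch

a = torch.randn(4, device="cuda")
key = a.data_ptr()
del a                               # block returned to the caching allocator
b = torch.randn(4, device="cuda")   # may land at the very same address
print(key == b.data_ptr())          # can print True -> stale-cache hazard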
@@ -6,6 +6,8 @@ from unifolm_wma.utils.diffusion import make_ddim_sampling_parameters, make_ddim
 from unifolm_wma.utils.common import noise_like
 from unifolm_wma.utils.common import extract_into_tensor
 from tqdm import tqdm
+from unifolm_wma.modules.attention import enable_cross_attn_kv_cache, disable_cross_attn_kv_cache
+from unifolm_wma.modules.networks.wma_model import enable_ctx_cache, disable_ctx_cache


 class DDIMSampler(object):
@@ -67,11 +69,12 @@
             ddim_timesteps=self.ddim_timesteps,
             eta=ddim_eta,
             verbose=verbose)
-        self.register_buffer('ddim_sigmas', ddim_sigmas)
-        self.register_buffer('ddim_alphas', ddim_alphas)
-        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
+        # Ensure tensors are on the correct device for efficient indexing
+        self.register_buffer('ddim_sigmas', to_torch(torch.as_tensor(ddim_sigmas)))
+        self.register_buffer('ddim_alphas', to_torch(torch.as_tensor(ddim_alphas)))
+        self.register_buffer('ddim_alphas_prev', to_torch(torch.as_tensor(ddim_alphas_prev)))
         self.register_buffer('ddim_sqrt_one_minus_alphas',
-                             np.sqrt(1. - ddim_alphas))
+                             to_torch(torch.as_tensor(np.sqrt(1. - ddim_alphas))))
         sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
             (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) *
             (1 - self.alphas_cumprod / self.alphas_cumprod_prev))
@@ -241,63 +244,70 @@

         dp_ddim_scheduler_action.set_timesteps(len(timesteps))
         dp_ddim_scheduler_state.set_timesteps(len(timesteps))
-        for i, step in enumerate(iterator):
-            index = total_steps - i - 1
-            ts = torch.full((b, ), step, device=device, dtype=torch.long)
+        ts = torch.empty((b, ), device=device, dtype=torch.long)
+        enable_cross_attn_kv_cache(self.model)
+        enable_ctx_cache(self.model)
+        try:
+            for i, step in enumerate(iterator):
+                index = total_steps - i - 1
+                ts.fill_(step)

-            # Use mask to blend noised original latent (img_orig) & new sampled latent (img)
-            if mask is not None:
-                assert x0 is not None
-                if clean_cond:
-                    img_orig = x0
-                else:
-                    img_orig = self.model.q_sample(x0, ts)
-                img = img_orig * mask + (1. - mask) * img
+                # Use mask to blend noised original latent (img_orig) & new sampled latent (img)
+                if mask is not None:
+                    assert x0 is not None
+                    if clean_cond:
+                        img_orig = x0
+                    else:
+                        img_orig = self.model.q_sample(x0, ts)
+                    img = img_orig * mask + (1. - mask) * img

-            outs = self.p_sample_ddim(
-                img,
-                action,
-                state,
-                cond,
-                ts,
-                index=index,
-                use_original_steps=ddim_use_original_steps,
-                quantize_denoised=quantize_denoised,
-                temperature=temperature,
-                noise_dropout=noise_dropout,
-                score_corrector=score_corrector,
-                corrector_kwargs=corrector_kwargs,
-                unconditional_guidance_scale=unconditional_guidance_scale,
-                unconditional_conditioning=unconditional_conditioning,
-                mask=mask,
-                x0=x0,
-                fs=fs,
-                guidance_rescale=guidance_rescale,
-                **kwargs)
-            img, pred_x0, model_output_action, model_output_state = outs
+                outs = self.p_sample_ddim(
+                    img,
+                    action,
+                    state,
+                    cond,
+                    ts,
+                    index=index,
+                    use_original_steps=ddim_use_original_steps,
+                    quantize_denoised=quantize_denoised,
+                    temperature=temperature,
+                    noise_dropout=noise_dropout,
+                    score_corrector=score_corrector,
+                    corrector_kwargs=corrector_kwargs,
+                    unconditional_guidance_scale=unconditional_guidance_scale,
+                    unconditional_conditioning=unconditional_conditioning,
+                    mask=mask,
+                    x0=x0,
+                    fs=fs,
+                    guidance_rescale=guidance_rescale,
+                    **kwargs)
+                img, pred_x0, model_output_action, model_output_state = outs

-            action = dp_ddim_scheduler_action.step(
-                model_output_action,
-                step,
-                action,
-                generator=None,
-            ).prev_sample
-            state = dp_ddim_scheduler_state.step(
-                model_output_state,
-                step,
-                state,
-                generator=None,
-            ).prev_sample
+                action = dp_ddim_scheduler_action.step(
+                    model_output_action,
+                    step,
+                    action,
+                    generator=None,
+                ).prev_sample
+                state = dp_ddim_scheduler_state.step(
+                    model_output_state,
+                    step,
+                    state,
+                    generator=None,
+                ).prev_sample

-            if callback: callback(i)
-            if img_callback: img_callback(pred_x0, i)
+                if callback: callback(i)
+                if img_callback: img_callback(pred_x0, i)

-            if index % log_every_t == 0 or index == total_steps - 1:
-                intermediates['x_inter'].append(img)
-                intermediates['pred_x0'].append(pred_x0)
-                intermediates['x_inter_action'].append(action)
-                intermediates['x_inter_state'].append(state)
+                if index % log_every_t == 0 or index == total_steps - 1:
+                    intermediates['x_inter'].append(img)
+                    intermediates['pred_x0'].append(pred_x0)
+                    intermediates['x_inter_action'].append(action)
+                    intermediates['x_inter_state'].append(state)
+        finally:
+            disable_cross_attn_kv_cache(self.model)
+            disable_ctx_cache(self.model)

         return img, action, state, intermediates
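enable_cross_attn_kv_cache / enable_ctx_cache are imported above but their bodies are outside this diff; given the _kv_cache_enabled and _global_cond_cache_enabled flags added to the modules, they are presumably thin toggles along these lines (hypothetical sketch, not the real implementation):

# Hypothetical sketch; the real helpers live in unifolm_wma.modules.attention
# and unifolm_wma.modules.networks.wma_model, outside this diff.
def enable_cross_attn_kv_cache(model):
    for m in model.modules():
        if hasattr(m, "_kv_cache_enabled"):
            m._kv_cache_enabled = True

def disable_cross_attn_kv_cache(model):
    for m in model.modules():
        if hasattr(m, "_kv_cache_enabled"):
            m._kv_cache_enabled = False
            m._kv_cache.clear()  # drop entries keyed on possibly recycled pointers

The try/finally wrapper above guarantees the caches are torn down even when sampling raises, so a later call never sees stale entries.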
@@ -325,10 +335,6 @@
                       guidance_rescale=0.0,
                       **kwargs):
        b, *_, device = *x.shape, x.device
-        if x.dim() == 5:
-            is_video = True
-        else:
-            is_video = False

        if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
            model_output, model_output_action, model_output_state = self.model.apply_model(
@@ -377,17 +383,11 @@
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
        sigmas = self.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas

-        if is_video:
-            size = (b, 1, 1, 1, 1)
-        else:
-            size = (b, 1, 1, 1)
-
-        a_t = torch.full(size, alphas[index], device=device)
-        a_prev = torch.full(size, alphas_prev[index], device=device)
-        sigma_t = torch.full(size, sigmas[index], device=device)
-        sqrt_one_minus_at = torch.full(size,
-                                       sqrt_one_minus_alphas[index],
-                                       device=device)
+        # Use 0-d tensors directly (already on device); broadcasting handles shape
+        a_t = alphas[index]
+        a_prev = alphas_prev[index]
+        sigma_t = sigmas[index]
+        sqrt_one_minus_at = sqrt_one_minus_alphas[index]

        if self.model.parameterization != "v":
            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
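Replacing torch.full(size, ...) with direct 0-d indexing removes several per-step scalar materializations (the buffers were moved onto the device in the register_buffer change above), and broadcasting makes the explicit (b, 1, 1, 1, 1) shape unnecessary. A quick equivalence check (generic sketch, not repo code):

import torch

x = torch.randn(2, 4, 16, 40, 64, device="cuda")      # (b, c, t, h, w) latent
alphas = torch.linspace(0.1, 0.9, 50, device="cuda")  # buffer already on device

a_full = torch.full((2, 1, 1, 1, 1), alphas[7].item(), device="cuda")
a_0d = alphas[7]                                      # 0-d tensor, no shape juggling

assert torch.allclose(x * a_full, x * a_0d)           # broadcasting is equivalent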
@@ -395,12 +395,8 @@
            pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output)

        if self.model.use_dynamic_rescale:
-            scale_t = torch.full(size,
-                                 self.ddim_scale_arr[index],
-                                 device=device)
-            prev_scale_t = torch.full(size,
-                                      self.ddim_scale_arr_prev[index],
-                                      device=device)
+            scale_t = self.ddim_scale_arr[index]
+            prev_scale_t = self.ddim_scale_arr_prev[index]
            rescale = (prev_scale_t / scale_t)
            pred_x0 *= rescale
@@ -98,6 +98,10 @@ class CrossAttention(nn.Module):
        self.text_context_len = text_context_len
        self.agent_state_context_len = agent_state_context_len
        self.agent_action_context_len = agent_action_context_len
+        self._kv_cache = {}
+        self._kv_cache_enabled = False
+        self._kv_fused = False
+
        self.cross_attention_scale_learnable = cross_attention_scale_learnable
        if self.image_cross_attention:
            self.to_k_ip = nn.Linear(context_dim, inner_dim, bias=False)
@@ -114,6 +118,27 @@ class CrossAttention(nn.Module):
             self.register_parameter('alpha_caa',
                                     nn.Parameter(torch.tensor(0.)))
 
+    def fuse_kv(self):
+        """Fuse to_k/to_v into to_kv (2 Linear → 1). Works for all layers."""
+        k_w = self.to_k.weight  # (inner_dim, context_dim)
+        v_w = self.to_v.weight
+        self.to_kv = nn.Linear(k_w.shape[1], k_w.shape[0] * 2, bias=False)
+        self.to_kv.weight = nn.Parameter(torch.cat([k_w, v_w], dim=0))
+        del self.to_k, self.to_v
+        if self.image_cross_attention:
+            for suffix in ('_ip', '_as', '_aa'):
+                k_attr = f'to_k{suffix}'
+                v_attr = f'to_v{suffix}'
+                kw = getattr(self, k_attr).weight
+                vw = getattr(self, v_attr).weight
+                fused = nn.Linear(kw.shape[1], kw.shape[0] * 2, bias=False)
+                fused.weight = nn.Parameter(torch.cat([kw, vw], dim=0))
+                setattr(self, f'to_kv{suffix}', fused)
+                delattr(self, k_attr)
+                delattr(self, v_attr)
+        self._kv_fused = True
+        return True
+
     def forward(self, x, context=None, mask=None):
         spatial_self_attn = (context is None)
         k_ip, v_ip, out_ip = None, None, None
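
fuse_kv above halves the number of GEMM launches per context by stacking the K and V weight matrices into one projection. A quick equivalence check against plain nn.Linear (a sketch, not repo code):

    import torch
    import torch.nn as nn

    ctx_dim, inner_dim = 1024, 320
    to_k = nn.Linear(ctx_dim, inner_dim, bias=False)
    to_v = nn.Linear(ctx_dim, inner_dim, bias=False)

    # fuse: nn.Linear weight is (out, in), so concatenating along dim 0
    # stacks the K rows above the V rows
    to_kv = nn.Linear(ctx_dim, inner_dim * 2, bias=False)
    to_kv.weight = nn.Parameter(torch.cat([to_k.weight, to_v.weight], dim=0))

    ctx = torch.randn(2, 77, ctx_dim)
    k, v = to_kv(ctx).chunk(2, dim=-1)   # one GEMM, split on the feature dim
    assert torch.allclose(k, to_k(ctx), atol=1e-6)
    assert torch.allclose(v, to_v(ctx), atol=1e-6)
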
@@ -140,19 +165,28 @@ class CrossAttention(nn.Module):
                                       self.agent_action_context_len +
                                       self.text_context_len:, :]
 
-            k = self.to_k(context_ins)
-            v = self.to_v(context_ins)
-            k_ip = self.to_k_ip(context_image)
-            v_ip = self.to_v_ip(context_image)
-            k_as = self.to_k_as(context_agent_state)
-            v_as = self.to_v_as(context_agent_state)
-            k_aa = self.to_k_aa(context_agent_action)
-            v_aa = self.to_v_aa(context_agent_action)
+            if self._kv_fused:
+                k, v = self.to_kv(context_ins).chunk(2, dim=-1)
+                k_ip, v_ip = self.to_kv_ip(context_image).chunk(2, dim=-1)
+                k_as, v_as = self.to_kv_as(context_agent_state).chunk(2, dim=-1)
+                k_aa, v_aa = self.to_kv_aa(context_agent_action).chunk(2, dim=-1)
+            else:
+                k = self.to_k(context_ins)
+                v = self.to_v(context_ins)
+                k_ip = self.to_k_ip(context_image)
+                v_ip = self.to_v_ip(context_image)
+                k_as = self.to_k_as(context_agent_state)
+                v_as = self.to_v_as(context_agent_state)
+                k_aa = self.to_k_aa(context_agent_action)
+                v_aa = self.to_v_aa(context_agent_action)
         else:
             if not spatial_self_attn:
                 context = context[:, :self.text_context_len, :]
-            k = self.to_k(context)
-            v = self.to_v(context)
+            if self._kv_fused:
+                k, v = self.to_kv(context).chunk(2, dim=-1)
+            else:
+                k = self.to_k(context)
+                v = self.to_v(context)
 
         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h),
                       (q, k, v))
@@ -236,134 +270,162 @@ class CrossAttention(nn.Module):
         k_ip, v_ip, out_ip = None, None, None
         k_as, v_as, out_as = None, None, None
         k_aa, v_aa, out_aa = None, None, None
+        attn_mask_aa = None
 
+        h = self.heads
         q = self.to_q(x)
         context = default(context, x)
 
-        if self.image_cross_attention and not spatial_self_attn:
+        b, _, _ = q.shape
+        q = q.unsqueeze(3).reshape(b, q.shape[1], h, self.dim_head).permute(0, 2, 1, 3).reshape(b * h, q.shape[1], self.dim_head).contiguous()
+
+        def _reshape_kv(t):
+            return t.unsqueeze(3).reshape(b, t.shape[1], h, self.dim_head).permute(0, 2, 1, 3).reshape(b * h, t.shape[1], self.dim_head).contiguous()
+
+        use_cache = self._kv_cache_enabled and not spatial_self_attn
+        cache_hit = use_cache and len(self._kv_cache) > 0
+
+        if cache_hit:
+            k = self._kv_cache['k']
+            v = self._kv_cache['v']
+            k_ip = self._kv_cache.get('k_ip')
+            v_ip = self._kv_cache.get('v_ip')
+            k_as = self._kv_cache.get('k_as')
+            v_as = self._kv_cache.get('v_as')
+            k_aa = self._kv_cache.get('k_aa')
+            v_aa = self._kv_cache.get('v_aa')
+            attn_mask_aa = self._kv_cache.get('attn_mask_aa')
+        elif self.image_cross_attention and not spatial_self_attn:
             if context.shape[1] == self.text_context_len + self.video_length:
                 context_ins, context_image = context[:, :self.text_context_len, :], context[:,self.text_context_len:, :]
-                k = self.to_k(context)
-                v = self.to_v(context)
-                k_ip = self.to_k_ip(context_image)
-                v_ip = self.to_v_ip(context_image)
+                if self._kv_fused:
+                    k, v = self.to_kv(context).chunk(2, dim=-1)
+                    k_ip, v_ip = self.to_kv_ip(context_image).chunk(2, dim=-1)
+                else:
+                    k = self.to_k(context)
+                    v = self.to_v(context)
+                    k_ip = self.to_k_ip(context_image)
+                    v_ip = self.to_v_ip(context_image)
+                k, v = map(_reshape_kv, (k, v))
+                k_ip, v_ip = map(_reshape_kv, (k_ip, v_ip))
+                if use_cache:
+                    self._kv_cache = {'k': k, 'v': v, 'k_ip': k_ip, 'v_ip': v_ip}
             elif context.shape[1] == self.agent_state_context_len + self.text_context_len + self.video_length:
                 context_agent_state = context[:, :self.agent_state_context_len, :]
                 context_ins = context[:, self.agent_state_context_len:self.agent_state_context_len+self.text_context_len, :]
                 context_image = context[:, self.agent_state_context_len+self.text_context_len:, :]
-                k = self.to_k(context_ins)
-                v = self.to_v(context_ins)
-                k_ip = self.to_k_ip(context_image)
-                v_ip = self.to_v_ip(context_image)
-                k_as = self.to_k_as(context_agent_state)
-                v_as = self.to_v_as(context_agent_state)
+                if self._kv_fused:
+                    k, v = self.to_kv(context_ins).chunk(2, dim=-1)
+                    k_ip, v_ip = self.to_kv_ip(context_image).chunk(2, dim=-1)
+                    k_as, v_as = self.to_kv_as(context_agent_state).chunk(2, dim=-1)
+                else:
+                    k = self.to_k(context_ins)
+                    v = self.to_v(context_ins)
+                    k_ip = self.to_k_ip(context_image)
+                    v_ip = self.to_v_ip(context_image)
+                    k_as = self.to_k_as(context_agent_state)
+                    v_as = self.to_v_as(context_agent_state)
+                k, v = map(_reshape_kv, (k, v))
+                k_ip, v_ip = map(_reshape_kv, (k_ip, v_ip))
+                k_as, v_as = map(_reshape_kv, (k_as, v_as))
+                if use_cache:
+                    self._kv_cache = {'k': k, 'v': v, 'k_ip': k_ip, 'v_ip': v_ip, 'k_as': k_as, 'v_as': v_as}
             else:
                 context_agent_state = context[:, :self.agent_state_context_len, :]
                 context_agent_action = context[:, self.agent_state_context_len:self.agent_state_context_len+self.agent_action_context_len, :]
                 context_ins = context[:, self.agent_state_context_len+self.agent_action_context_len:self.agent_state_context_len+self.agent_action_context_len+self.text_context_len, :]
                 context_image = context[:, self.agent_state_context_len+self.agent_action_context_len+self.text_context_len:, :]
 
-                k = self.to_k(context_ins)
-                v = self.to_v(context_ins)
-                k_ip = self.to_k_ip(context_image)
-                v_ip = self.to_v_ip(context_image)
-                k_as = self.to_k_as(context_agent_state)
-                v_as = self.to_v_as(context_agent_state)
-                k_aa = self.to_k_aa(context_agent_action)
-                v_aa = self.to_v_aa(context_agent_action)
+                if self._kv_fused:
+                    k, v = self.to_kv(context_ins).chunk(2, dim=-1)
+                    k_ip, v_ip = self.to_kv_ip(context_image).chunk(2, dim=-1)
+                    k_as, v_as = self.to_kv_as(context_agent_state).chunk(2, dim=-1)
+                    k_aa, v_aa = self.to_kv_aa(context_agent_action).chunk(2, dim=-1)
+                else:
+                    k = self.to_k(context_ins)
+                    v = self.to_v(context_ins)
+                    k_ip = self.to_k_ip(context_image)
+                    v_ip = self.to_v_ip(context_image)
+                    k_as = self.to_k_as(context_agent_state)
+                    v_as = self.to_v_as(context_agent_state)
+                    k_aa = self.to_k_aa(context_agent_action)
+                    v_aa = self.to_v_aa(context_agent_action)
 
-                attn_mask_aa = self._get_attn_mask_aa(x.shape[0],
-                                                      q.shape[1],
-                                                      k_aa.shape[1],
-                                                      block_size=16).to(k_aa.device)
+                k, v = map(_reshape_kv, (k, v))
+                k_ip, v_ip = map(_reshape_kv, (k_ip, v_ip))
+                k_as, v_as = map(_reshape_kv, (k_as, v_as))
+                k_aa, v_aa = map(_reshape_kv, (k_aa, v_aa))
+
+                attn_mask_aa_raw = self._get_attn_mask_aa(x.shape[0],
+                                                          q.shape[1],
+                                                          k_aa.shape[1],
+                                                          block_size=16,
+                                                          device=k_aa.device)
+                attn_mask_aa = attn_mask_aa_raw.unsqueeze(1).repeat(1, h, 1, 1).reshape(
+                    b * h, attn_mask_aa_raw.shape[1], attn_mask_aa_raw.shape[2]).to(q.dtype)
+
+                if use_cache:
+                    self._kv_cache = {
+                        'k': k, 'v': v, 'k_ip': k_ip, 'v_ip': v_ip,
+                        'k_as': k_as, 'v_as': v_as, 'k_aa': k_aa, 'v_aa': v_aa,
+                        'attn_mask_aa': attn_mask_aa,
+                    }
         else:
             if not spatial_self_attn:
                 assert 1 > 2, ">>> ERROR: you should never go into here ..."
                 context = context[:, :self.text_context_len, :]
-                k = self.to_k(context)
-                v = self.to_v(context)
-
-        b, _, _ = q.shape
-        q = q.unsqueeze(3).reshape(b, q.shape[1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(b * self.heads, q.shape[1], self.dim_head).contiguous()
+                if self._kv_fused:
+                    k, v = self.to_kv(context).chunk(2, dim=-1)
+                else:
+                    k = self.to_k(context)
+                    v = self.to_v(context)
+                k, v = map(_reshape_kv, (k, v))
+                if use_cache:
+                    self._kv_cache = {'k': k, 'v': v}
 
         if k is not None:
-            k, v = map(
-                lambda t: t.unsqueeze(3).reshape(b, t.shape[
-                    1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(
-                        b * self.heads, t.shape[1], self.dim_head).contiguous(),
-                (k, v),
-            )
             out = xformers.ops.memory_efficient_attention(q,
                                                           k,
                                                           v,
                                                           attn_bias=None,
                                                           op=None)
             out = (out.unsqueeze(0).reshape(
-                b, self.heads, out.shape[1],
+                b, h, out.shape[1],
                 self.dim_head).permute(0, 2, 1,
                                        3).reshape(b, out.shape[1],
-                                                  self.heads * self.dim_head))
+                                                  h * self.dim_head))
 
         if k_ip is not None:
-            # For image cross-attention
-            k_ip, v_ip = map(
-                lambda t: t.unsqueeze(3).reshape(b, t.shape[
-                    1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(
-                        b * self.heads, t.shape[1], self.dim_head).contiguous(
-                        ),
-                (k_ip, v_ip),
-            )
             out_ip = xformers.ops.memory_efficient_attention(q,
                                                              k_ip,
                                                              v_ip,
                                                              attn_bias=None,
                                                              op=None)
             out_ip = (out_ip.unsqueeze(0).reshape(
-                b, self.heads, out_ip.shape[1],
+                b, h, out_ip.shape[1],
                 self.dim_head).permute(0, 2, 1,
                                        3).reshape(b, out_ip.shape[1],
-                                                  self.heads * self.dim_head))
+                                                  h * self.dim_head))
 
         if k_as is not None:
-            # For agent state cross-attention
-            k_as, v_as = map(
-                lambda t: t.unsqueeze(3).reshape(b, t.shape[
-                    1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(
-                        b * self.heads, t.shape[1], self.dim_head).contiguous(
-                        ),
-                (k_as, v_as),
-            )
             out_as = xformers.ops.memory_efficient_attention(q,
                                                              k_as,
                                                              v_as,
                                                              attn_bias=None,
                                                              op=None)
             out_as = (out_as.unsqueeze(0).reshape(
-                b, self.heads, out_as.shape[1],
+                b, h, out_as.shape[1],
                 self.dim_head).permute(0, 2, 1,
                                        3).reshape(b, out_as.shape[1],
-                                                  self.heads * self.dim_head))
+                                                  h * self.dim_head))
 
         if k_aa is not None:
-            # For agent action cross-attention
-            k_aa, v_aa = map(
-                lambda t: t.unsqueeze(3).reshape(b, t.shape[
-                    1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape(
-                        b * self.heads, t.shape[1], self.dim_head).contiguous(
-                        ),
-                (k_aa, v_aa),
-            )
-
-            attn_mask_aa = attn_mask_aa.unsqueeze(1).repeat(1,self.heads,1,1).reshape(
-                b * self.heads, attn_mask_aa.shape[1], attn_mask_aa.shape[2])
-            attn_mask_aa = attn_mask_aa.to(q.dtype)
-
             out_aa = xformers.ops.memory_efficient_attention(
                 q, k_aa, v_aa, attn_bias=attn_mask_aa, op=None)
 
             out_aa = (out_aa.unsqueeze(0).reshape(
-                b, self.heads, out_aa.shape[1],
+                b, h, out_aa.shape[1],
                 self.dim_head).permute(0, 2, 1,
                                        3).reshape(b, out_aa.shape[1],
-                                                  self.heads * self.dim_head))
+                                                  h * self.dim_head))
         if exists(mask):
             raise NotImplementedError
 
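
The cache in the hunk above is valid because the cross-attention context is constant across all DDIM steps of one generation: K/V (and the agent-action mask) are projected and reshaped once on the first step and replayed on every later step. A toy sketch of the same first-call-computes, later-calls-replay pattern (not repo code):

    cache = {}

    def cached_kv(context, project_k, project_v):
        if cache:                                 # any later step: replay
            return cache['k'], cache['v']
        k, v = project_k(context), project_v(context)
        cache.update(k=k, v=v)                    # first step: compute once
        return k, v

Note the cache is keyed only on being non-empty, so it must be cleared whenever the conditioning changes.
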
@@ -386,17 +448,43 @@ class CrossAttention(nn.Module):
 
         return self.to_out(out)
 
-    def _get_attn_mask_aa(self, b, l1, l2, block_size=16):
+    def _get_attn_mask_aa(self, b, l1, l2, block_size=16, device=None):
+        cache_key = (b, l1, l2, block_size)
+        if hasattr(self, '_attn_mask_aa_cache_key') and self._attn_mask_aa_cache_key == cache_key:
+            cached = self._attn_mask_aa_cache
+            if device is not None and cached.device != torch.device(device):
+                cached = cached.to(device)
+                self._attn_mask_aa_cache = cached
+            return cached
+
+        target_device = device if device is not None else 'cpu'
         num_token = l2 // block_size
-        start_positions = ((torch.arange(b) % block_size) + 1) * num_token
-        col_indices = torch.arange(l2)
+        start_positions = ((torch.arange(b, device=target_device) % block_size) + 1) * num_token
+        col_indices = torch.arange(l2, device=target_device)
         mask_2d = col_indices.unsqueeze(0) >= start_positions.unsqueeze(1)
         mask = mask_2d.unsqueeze(1).expand(b, l1, l2)
-        attn_mask = torch.zeros_like(mask, dtype=torch.float)
+        attn_mask = torch.zeros(b, l1, l2, dtype=torch.float, device=target_device)
         attn_mask[mask] = float('-inf')
+
+        self._attn_mask_aa_cache_key = cache_key
+        self._attn_mask_aa_cache = attn_mask
         return attn_mask
 
 
+def enable_cross_attn_kv_cache(module):
+    for m in module.modules():
+        if isinstance(m, CrossAttention):
+            m._kv_cache_enabled = True
+            m._kv_cache = {}
+
+
+def disable_cross_attn_kv_cache(module):
+    for m in module.modules():
+        if isinstance(m, CrossAttention):
+            m._kv_cache_enabled = False
+            m._kv_cache = {}
+
+
 class BasicTransformerBlock(nn.Module):
 
     def __init__(self,
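
A hypothetical call-site sketch for the two helpers above (the sampler-side integration is not part of this diff, and the import path and sampler call are assumptions): enable the cache for one generation, then disable it, which also clears the stored tensors.

    # assumed module path; adjust to wherever CrossAttention lives
    from unifolm_wma.modules.attention import (
        enable_cross_attn_kv_cache, disable_cross_attn_kv_cache)

    unet = model.model.diffusion_model            # UNet holding CrossAttention layers
    enable_cross_attn_kv_cache(unet)
    try:
        samples = ddim_sampler.sample(...)        # context fixed across all steps
    finally:
        disable_cross_attn_kv_cache(unet)         # reset _kv_cache before next prompt
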
@@ -685,6 +685,28 @@ class WMAModel(nn.Module):
         self.action_token_projector = instantiate_from_config(
             stem_process_config)
 
+        # Context precomputation cache
+        self._ctx_cache_enabled = False
+        self._ctx_cache = {}
+        self._trt_backbone = None  # TRT engine for video UNet backbone
+        # Reusable CUDA stream for parallel state_unet / action_unet
+        self._state_stream = torch.cuda.Stream()
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state.pop('_state_stream', None)
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        self._state_stream = torch.cuda.Stream()
+
+    def load_trt_backbone(self, engine_path, n_hs_a=9):
+        """Load a TensorRT engine for the video UNet backbone."""
+        from unifolm_wma.trt_utils import TRTBackbone
+        self._trt_backbone = TRTBackbone(engine_path, n_hs_a=n_hs_a)
+        print(f">>> TRT backbone loaded from {engine_path}")
+
     def forward(self,
                 x: Tensor,
                 x_action: Tensor,
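
The __getstate__/__setstate__ pair above exists because torch.cuda.Stream wraps a live CUDA handle and cannot be pickled; the stream is dropped on save and rebuilt on load. The same pattern in isolation (requires a CUDA device; not repo code):

    import pickle
    import torch

    class Holder:
        def __init__(self):
            self._stream = torch.cuda.Stream()   # unpicklable CUDA handle
        def __getstate__(self):
            state = self.__dict__.copy()
            state.pop('_stream', None)           # drop before pickling
            return state
        def __setstate__(self, state):
            self.__dict__.update(state)
            self._stream = torch.cuda.Stream()   # recreate on load

    h2 = pickle.loads(pickle.dumps(Holder()))    # round-trips cleanly
    assert isinstance(h2._stream, torch.cuda.Stream)
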
@@ -720,58 +742,64 @@ class WMAModel(nn.Module):
                                   repeat_only=False).type(x.dtype)
         emb = self.time_embed(t_emb)
 
-        bt, l_context, _ = context.shape
-        if self.base_model_gen_only:
-            assert l_context == 77 + self.n_obs_steps * 16, ">>> ERROR Context dim 1 ..." ## NOTE HANDCODE
+        _ctx_key = context.data_ptr()
+        if self._ctx_cache_enabled and _ctx_key in self._ctx_cache:
+            context = self._ctx_cache[_ctx_key]
         else:
-            if l_context == self.n_obs_steps + 77 + t * 16:
-                context_agent_state = context[:, :self.n_obs_steps]
-                context_text = context[:, self.n_obs_steps:self.n_obs_steps +
-                                       77, :]
-                context_img = context[:, self.n_obs_steps + 77:, :]
-                context_agent_state = context_agent_state.repeat_interleave(
-                    repeats=t, dim=0)
-                context_text = context_text.repeat_interleave(repeats=t, dim=0)
-                context_img = rearrange(context_img,
-                                        'b (t l) c -> (b t) l c',
-                                        t=t)
-                context = torch.cat(
-                    [context_agent_state, context_text, context_img], dim=1)
-            elif l_context == self.n_obs_steps + 16 + 77 + t * 16:
-                context_agent_state = context[:, :self.n_obs_steps]
-                context_agent_action = context[:, self.
-                                               n_obs_steps:self.n_obs_steps +
-                                               16, :]
-                context_agent_action = rearrange(
-                    context_agent_action.unsqueeze(2), 'b t l d -> (b t) l d')
-                context_agent_action = self.action_token_projector(
-                    context_agent_action)
-                context_agent_action = rearrange(context_agent_action,
-                                                 '(b o) l d -> b o l d',
-                                                 o=t)
-                context_agent_action = rearrange(context_agent_action,
-                                                 'b o (t l) d -> b o t l d',
-                                                 t=t)
-                context_agent_action = context_agent_action.permute(
-                    0, 2, 1, 3, 4)
-                context_agent_action = rearrange(context_agent_action,
-                                                 'b t o l d -> (b t) (o l) d')
-
-                context_text = context[:, self.n_obs_steps +
-                                       16:self.n_obs_steps + 16 + 77, :]
-                context_text = context_text.repeat_interleave(repeats=t, dim=0)
-
-                context_img = context[:, self.n_obs_steps + 16 + 77:, :]
-                context_img = rearrange(context_img,
-                                        'b (t l) c -> (b t) l c',
-                                        t=t)
-                context_agent_state = context_agent_state.repeat_interleave(
-                    repeats=t, dim=0)
-                context = torch.cat([
-                    context_agent_state, context_agent_action, context_text,
-                    context_img
-                ],
-                                    dim=1)
+            bt, l_context, _ = context.shape
+            if self.base_model_gen_only:
+                assert l_context == 77 + self.n_obs_steps * 16, ">>> ERROR Context dim 1 ..." ## NOTE HANDCODE
+            else:
+                if l_context == self.n_obs_steps + 77 + t * 16:
+                    context_agent_state = context[:, :self.n_obs_steps]
+                    context_text = context[:, self.n_obs_steps:self.n_obs_steps +
+                                           77, :]
+                    context_img = context[:, self.n_obs_steps + 77:, :]
+                    context_agent_state = context_agent_state.repeat_interleave(
+                        repeats=t, dim=0)
+                    context_text = context_text.repeat_interleave(repeats=t, dim=0)
+                    context_img = rearrange(context_img,
+                                            'b (t l) c -> (b t) l c',
+                                            t=t)
+                    context = torch.cat(
+                        [context_agent_state, context_text, context_img], dim=1)
+                elif l_context == self.n_obs_steps + 16 + 77 + t * 16:
+                    context_agent_state = context[:, :self.n_obs_steps]
+                    context_agent_action = context[:, self.
+                                                   n_obs_steps:self.n_obs_steps +
+                                                   16, :]
+                    context_agent_action = rearrange(
+                        context_agent_action.unsqueeze(2), 'b t l d -> (b t) l d')
+                    context_agent_action = self.action_token_projector(
+                        context_agent_action)
+                    context_agent_action = rearrange(context_agent_action,
+                                                     '(b o) l d -> b o l d',
+                                                     o=t)
+                    context_agent_action = rearrange(context_agent_action,
+                                                     'b o (t l) d -> b o t l d',
+                                                     t=t)
+                    context_agent_action = context_agent_action.permute(
+                        0, 2, 1, 3, 4)
+                    context_agent_action = rearrange(context_agent_action,
+                                                     'b t o l d -> (b t) (o l) d')
+
+                    context_text = context[:, self.n_obs_steps +
+                                           16:self.n_obs_steps + 16 + 77, :]
+                    context_text = context_text.repeat_interleave(repeats=t, dim=0)
+
+                    context_img = context[:, self.n_obs_steps + 16 + 77:, :]
+                    context_img = rearrange(context_img,
+                                            'b (t l) c -> (b t) l c',
+                                            t=t)
+                    context_agent_state = context_agent_state.repeat_interleave(
+                        repeats=t, dim=0)
+                    context = torch.cat([
+                        context_agent_state, context_agent_action, context_text,
+                        context_img
+                    ],
+                                        dim=1)
+            if self._ctx_cache_enabled:
+                self._ctx_cache[_ctx_key] = context
 
         emb = emb.repeat_interleave(repeats=t, dim=0)
 
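
The cache key above is the storage address of the incoming context tensor, so a hit requires the caller to pass the same tensor object on later calls; a value-equal copy misses, and a recycled allocation could in principle collide, which is one reason the cache is opt-in. A small illustration (not repo code):

    import torch

    cache = {}
    ctx = torch.randn(2, 207, 1024)
    cache[ctx.data_ptr()] = 'packed context'

    assert ctx.data_ptr() in cache                # same storage: hit
    assert ctx.clone().data_ptr() not in cache    # equal values, new storage: miss
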
@@ -791,58 +819,92 @@ class WMAModel(nn.Module):
             fs_embed = fs_embed.repeat_interleave(repeats=t, dim=0)
             emb = emb + fs_embed
 
-        h = x.type(self.dtype)
-        adapter_idx = 0
-        hs = []
-        hs_a = []
-        for id, module in enumerate(self.input_blocks):
-            h = module(h, emb, context=context, batch_size=b)
-            if id == 0 and self.addition_attention:
-                h = self.init_attn(h, emb, context=context, batch_size=b)
-            # plug-in adapter features
-            if ((id + 1) % 3 == 0) and features_adapter is not None:
-                h = h + features_adapter[adapter_idx]
-                adapter_idx += 1
-            if id != 0:
-                if isinstance(module[0], Downsample):
-                    hs_a.append(
-                        rearrange(hs[-1], '(b t) c h w -> b t c h w', t=t))
-            hs.append(h)
-            hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
-
-        if features_adapter is not None:
-            assert len(
-                features_adapter) == adapter_idx, 'Wrong features_adapter'
-        h = self.middle_block(h, emb, context=context, batch_size=b)
-        hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
-
-        hs_out = []
-        for module in self.output_blocks:
-            h = torch.cat([h, hs.pop()], dim=1)
-            h = module(h, emb, context=context, batch_size=b)
-            if isinstance(module[-1], Upsample):
-                hs_a.append(
-                    rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
-            hs_out.append(h)
-        h = h.type(x.dtype)
-        hs_a.append(rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
-
-        y = self.out(h)
-        y = rearrange(y, '(b t) c h w -> b c t h w', b=b)
+        if self._trt_backbone is not None:
+            # TRT path: run backbone via TensorRT engine
+            h_in = x.type(self.dtype).contiguous()
+            y, hs_a = self._trt_backbone(h_in, emb.contiguous(), context.contiguous())
+        else:
+            # PyTorch path: original backbone
+            h = x.type(self.dtype)
+            adapter_idx = 0
+            hs = []
+            hs_a = []
+            for id, module in enumerate(self.input_blocks):
+                h = module(h, emb, context=context, batch_size=b)
+                if id == 0 and self.addition_attention:
+                    h = self.init_attn(h, emb, context=context, batch_size=b)
+                # plug-in adapter features
+                if ((id + 1) % 3 == 0) and features_adapter is not None:
+                    h = h + features_adapter[adapter_idx]
+                    adapter_idx += 1
+                if id != 0:
+                    if isinstance(module[0], Downsample):
+                        hs_a.append(
+                            rearrange(hs[-1], '(b t) c h w -> b t c h w', t=t))
+                hs.append(h)
+                hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
+
+            if features_adapter is not None:
+                assert len(
+                    features_adapter) == adapter_idx, 'Wrong features_adapter'
+            h = self.middle_block(h, emb, context=context, batch_size=b)
+            hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
+
+            hs_out = []
+            for module in self.output_blocks:
+                h = torch.cat([h, hs.pop()], dim=1)
+                h = module(h, emb, context=context, batch_size=b)
+                if isinstance(module[-1], Upsample):
+                    hs_a.append(
+                        rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
+                hs_out.append(h)
+            h = h.type(x.dtype)
+            hs_a.append(rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
+
+            y = self.out(h)
+            y = rearrange(y, '(b t) c h w -> b c t h w', b=b)
 
         if not self.base_model_gen_only:
             ba, _, _ = x_action.shape
+            ts_state = timesteps[:ba] if b > 1 else timesteps
+            # Run action_unet and state_unet in parallel via CUDA streams
+            s_stream = self._state_stream
+            s_stream.wait_stream(torch.cuda.current_stream())
+            with torch.cuda.stream(s_stream):
+                s_y = self.state_unet(x_state, ts_state, hs_a,
+                                      context_action[:2], **kwargs)
             a_y = self.action_unet(x_action, timesteps[:ba], hs_a,
                                    context_action[:2], **kwargs)
-            # Predict state
-            if b > 1:
-                s_y = self.state_unet(x_state, timesteps[:ba], hs_a,
-                                      context_action[:2], **kwargs)
-            else:
-                s_y = self.state_unet(x_state, timesteps, hs_a,
-                                      context_action[:2], **kwargs)
+            torch.cuda.current_stream().wait_stream(s_stream)
         else:
             a_y = torch.zeros_like(x_action)
             s_y = torch.zeros_like(x_state)
 
         return y, a_y, s_y
 
 
+def enable_ctx_cache(model):
+    """Enable context precomputation cache on WMAModel and its action/state UNets."""
+    for m in model.modules():
+        if isinstance(m, WMAModel):
+            m._ctx_cache_enabled = True
+            m._ctx_cache = {}
+    # conditional_unet1d cache
+    from unifolm_wma.models.diffusion_head.conditional_unet1d import ConditionalUnet1D
+    for m in model.modules():
+        if isinstance(m, ConditionalUnet1D):
+            m._global_cond_cache_enabled = True
+            m._global_cond_cache = {}
+
+
+def disable_ctx_cache(model):
+    """Disable and clear context precomputation cache."""
+    for m in model.modules():
+        if isinstance(m, WMAModel):
+            m._ctx_cache_enabled = False
+            m._ctx_cache = {}
+    from unifolm_wma.models.diffusion_head.conditional_unet1d import ConditionalUnet1D
+    for m in model.modules():
+        if isinstance(m, ConditionalUnet1D):
+            m._global_cond_cache_enabled = False
+            m._global_cond_cache = {}
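
The stream choreography above follows the standard fork/join pattern: the side stream first waits on the default stream (so it observes x_state and hs_a), runs state_unet concurrently with action_unet, and the default stream joins before s_y is consumed. A skeleton of the same pattern (requires CUDA; not repo code; production code may additionally need Tensor.record_stream for allocations consumed across streams):

    import torch

    side = torch.cuda.Stream()
    a = torch.randn(4096, 4096, device='cuda')

    side.wait_stream(torch.cuda.current_stream())   # fork: see all prior work
    with torch.cuda.stream(side):
        out_side = a @ a                            # overlaps with default stream
    out_main = a.sin()
    torch.cuda.current_stream().wait_stream(side)   # join before consuming
    total = out_side.sum() + out_main.sum()
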
src/unifolm_wma/trt_utils.py (new file, +151 lines)
@@ -0,0 +1,151 @@
+"""TensorRT acceleration utilities for the video UNet backbone."""
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+from unifolm_wma.modules.networks.wma_model import Downsample, Upsample
+
+
+class VideoBackboneForExport(nn.Module):
+    """Wrapper that isolates the video UNet backbone for ONNX export.
+
+    Takes already-preprocessed inputs (after context/time embedding prep)
+    and returns y + hs_a as a flat tuple.
+    """
+
+    def __init__(self, wma_model):
+        super().__init__()
+        self.input_blocks = wma_model.input_blocks
+        self.middle_block = wma_model.middle_block
+        self.output_blocks = wma_model.output_blocks
+        self.out = wma_model.out
+        self.addition_attention = wma_model.addition_attention
+        if self.addition_attention:
+            self.init_attn = wma_model.init_attn
+        self.dtype = wma_model.dtype
+
+    def forward(self, h, emb, context):
+        t = 16
+        b = 1
+
+        hs = []
+        hs_a = []
+        h = h.type(self.dtype)
+        for id, module in enumerate(self.input_blocks):
+            h = module(h, emb, context=context, batch_size=b)
+            if id == 0 and self.addition_attention:
+                h = self.init_attn(h, emb, context=context, batch_size=b)
+            if id != 0:
+                if isinstance(module[0], Downsample):
+                    hs_a.append(rearrange(hs[-1], '(b t) c h w -> b t c h w', t=t))
+            hs.append(h)
+            hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
+
+        h = self.middle_block(h, emb, context=context, batch_size=b)
+        hs_a.append(rearrange(h, '(b t) c h w -> b t c h w', t=t))
+
+        hs_out = []
+        for module in self.output_blocks:
+            h = torch.cat([h, hs.pop()], dim=1)
+            h = module(h, emb, context=context, batch_size=b)
+            if isinstance(module[-1], Upsample):
+                hs_a.append(rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
+            hs_out.append(h)
+        hs_a.append(rearrange(hs_out[-1], '(b t) c h w -> b t c h w', t=t))
+
+        y = self.out(h.type(h.dtype))
+        y = rearrange(y, '(b t) c h w -> b c t h w', b=b)
+        return (y, *hs_a)
+
+
+def export_backbone_onnx(model, save_path, context_len=95):
+    wma = model.model.diffusion_model
+    wrapper = VideoBackboneForExport(wma)
+    wrapper.eval().cuda()
+
+    for m in wrapper.modules():
+        if hasattr(m, 'checkpoint'):
+            m.checkpoint = False
+        if hasattr(m, 'use_checkpoint'):
+            m.use_checkpoint = False
+
+    import xformers.ops
+    _orig_mea = xformers.ops.memory_efficient_attention
+    def _sdpa_replacement(q, k, v, attn_bias=None, op=None, **kw):
+        return torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
+    xformers.ops.memory_efficient_attention = _sdpa_replacement
+
+    BT = 16
+    emb_dim = wma.model_channels * 4
+    ctx_dim = 1024
+    in_ch = wma.in_channels
+
+    dummy_h = torch.randn(BT, in_ch, 40, 64, device='cuda', dtype=torch.float32)
+    dummy_emb = torch.randn(BT, emb_dim, device='cuda', dtype=torch.float32)
+    dummy_ctx = torch.randn(BT, context_len, ctx_dim, device='cuda', dtype=torch.float32)
+
+    with torch.no_grad():
+        outputs = wrapper(dummy_h, dummy_emb, dummy_ctx)
+    n_outputs = len(outputs)
+    print(f">>> Backbone has {n_outputs} outputs (1 y + {n_outputs-1} hs_a)")
+    for i, o in enumerate(outputs):
+        print(f"    output[{i}]: {o.shape} {o.dtype}")
+
+    output_names = ['y'] + [f'hs_a_{i}' for i in range(n_outputs - 1)]
+
+    torch.onnx.export(
+        wrapper,
+        (dummy_h, dummy_emb, dummy_ctx),
+        save_path,
+        input_names=['h', 'emb', 'context'],
+        output_names=output_names,
+        opset_version=17,
+        do_constant_folding=True,
+    )
+    print(f">>> ONNX exported to {save_path}")
+    xformers.ops.memory_efficient_attention = _orig_mea
+    return n_outputs
+
+
+class TRTBackbone:
+    """TensorRT runtime wrapper for the video UNet backbone."""
+
+    def __init__(self, engine_path, n_hs_a=9):
+        import tensorrt as trt
+
+        self.logger = trt.Logger(trt.Logger.WARNING)
+        with open(engine_path, 'rb') as f:
+            runtime = trt.Runtime(self.logger)
+            self.engine = runtime.deserialize_cuda_engine(f.read())
+        self.context = self.engine.create_execution_context()
+        self.n_hs_a = n_hs_a
+
+        import numpy as np
+        self.output_buffers = {}
+        for i in range(self.engine.num_io_tensors):
+            name = self.engine.get_tensor_name(i)
+            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
+                shape = self.engine.get_tensor_shape(name)
+                np_dtype = trt.nptype(self.engine.get_tensor_dtype(name))
+                buf = torch.empty(list(shape), dtype=torch.from_numpy(np.empty(0, dtype=np_dtype)).dtype, device='cuda')
+                self.output_buffers[name] = buf
+                print(f"    TRT output '{name}': {list(shape)} {buf.dtype}")
+
+    def __call__(self, h, emb, context):
+        import tensorrt as trt
+        for name, tensor in [('h', h), ('emb', emb), ('context', context)]:
+            expected_dtype = trt.nptype(self.engine.get_tensor_dtype(name))
+            torch_expected = torch.from_numpy(__import__('numpy').empty(0, dtype=expected_dtype)).dtype
+            if tensor.dtype != torch_expected:
+                tensor = tensor.to(torch_expected)
+            self.context.set_tensor_address(name, tensor.contiguous().data_ptr())
+
+        for name, buf in self.output_buffers.items():
+            self.context.set_tensor_address(name, buf.data_ptr())
+
+        self.context.execute_async_v3(torch.cuda.current_stream().cuda_stream)
+        torch.cuda.synchronize()
+
+        y = self.output_buffers['y']
+        hs_a = [self.output_buffers[f'hs_a_{i}'] for i in range(self.n_hs_a)]
+        return y, hs_a
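
The file above exports the ONNX graph and deserializes a prebuilt .engine, but the build step in between is not shown in this diff. A hedged sketch using the standard tensorrt Python builder API (TRT 8.6-style calls; the ONNX path is an assumption matching the export destination):

    import tensorrt as trt

    def build_engine(onnx_path, engine_path, fp16=True):
        logger = trt.Logger(trt.Logger.WARNING)
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        parser = trt.OnnxParser(network, logger)
        with open(onnx_path, 'rb') as f:
            if not parser.parse(f.read()):
                raise RuntimeError(parser.get_error(0))
        config = builder.create_builder_config()
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 8 << 30)
        if fp16 and builder.platform_has_fast_fp16:
            config.set_flag(trt.BuilderFlag.FP16)     # fp32 IO is preserved
        serialized = builder.build_serialized_network(network, config)
        with open(engine_path, 'wb') as f:
            f.write(serialized)

    build_engine('trt_engines/video_backbone.onnx',   # assumed export path
                 'trt_engines/video_backbone.engine')
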
unitree_g1_pack_camera/case1/output.log (new file, +179 lines)
@@ -0,0 +1,179 @@
+2026-02-18 19:01:56.891895: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-18 19:01:56.940243: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-18 19:01:56.940285: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-18 19:01:56.941395: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-18 19:01:56.948327: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2026-02-18 19:01:57.870809: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+Global seed set to 123
+>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
+>>> Prepared model loaded.
+INFO:root:***** Configing Data *****
+>>> unitree_z1_stackbox: 1 data samples loaded.
+>>> unitree_z1_stackbox: data stats loaded.
+>>> unitree_z1_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
+>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
+>>> unitree_g1_pack_camera: 1 data samples loaded.
+>>> unitree_g1_pack_camera: data stats loaded.
+>>> unitree_g1_pack_camera: normalizer initiated.
+>>> Dataset is successfully loaded ...
+✓ KV fused: 66 attention layers
+    TRT output 'y': [1, 4, 16, 40, 64] torch.float32
+    TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
+    TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
+>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
+>>> Generate 16 frames under each generation ...
+DEBUG:h5py._conv:Creating converter from 3 to 5
+DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
+DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
+DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
+
+  0%|          | 0/11 [00:00<?, ?it/s][02/18/2026-19:02:10] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
+
+  9%|▉         | 1/11 [00:17<02:51, 17.15s/it]>>> Step 0: generating actions ...
+>>> Step 0: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 1: generating actions ...
+[~90 repeated DEBUG:PIL.Image plugin-import lines trimmed, incl. two "failed to import ... No module named 'olefile'" notes]
+
+ 18%|█▊        | 2/11 [00:33<02:31, 16.87s/it]
+ 27%|██▋       | 3/11 [00:50<02:14, 16.76s/it]
+ 36%|███▋      | 4/11 [01:07<01:57, 16.81s/it]
+ 45%|████▌     | 5/11 [01:24<01:41, 16.85s/it]
+ 55%|█████▍    | 6/11 [01:41<01:24, 16.82s/it]
+ 64%|██████▎   | 7/11 [01:57<01:07, 16.82s/it]
+ 73%|███████▎  | 8/11 [02:14<00:50, 16.83s/it]
+ 82%|████████▏ | 9/11 [02:31<00:33, 16.80s/it]
+ 91%|█████████ | 10/11 [02:48<00:16, 16.81s/it]
+100%|██████████| 11/11 [03:05<00:00, 16.81s/it]
+100%|██████████| 11/11 [03:05<00:00, 16.83s/it]
+>>> Step 1: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 2: generating actions ...
+>>> Step 2: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 3: generating actions ...
+>>> Step 3: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 4: generating actions ...
+>>> Step 4: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 5: generating actions ...
+>>> Step 5: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 6: generating actions ...
+>>> Step 6: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 7: generating actions ...
+>>> Step 7: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
unitree_g1_pack_camera/case1/psnr_result.json (new file, +5 lines)
@@ -0,0 +1,5 @@
+{
+    "gt_video": "unitree_g1_pack_camera/case1/unitree_g1_pack_camera_case1.mp4",
+    "pred_video": "unitree_g1_pack_camera/case1/output/inference/0_full_fs6.mp4",
+    "psnr": 35.615362167470806
+}
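
For reference, a PSNR like the 35.62 dB recorded above is conventionally computed as 10*log10(MAX^2 / MSE) over all frames; a minimal sketch (the repo's evaluation script is not part of this diff):

    import numpy as np

    def video_psnr(gt, pred, max_val=255.0):
        # gt, pred: uint8 video arrays of identical shape (T, H, W, C)
        mse = np.mean((gt.astype(np.float64) - pred.astype(np.float64)) ** 2)
        return 10.0 * np.log10(max_val ** 2 / mse)
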
unitree_g1_pack_camera/case1/run_world_model_interaction.sh (new file, +24 lines)
@@ -0,0 +1,24 @@
+res_dir="unitree_g1_pack_camera/case1"
+dataset="unitree_g1_pack_camera"
+
+{
+time CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
+    --seed 123 \
+    --ckpt_path ckpts/unifolm_wma_dual.ckpt \
+    --config configs/inference/world_model_interaction.yaml \
+    --savedir "${res_dir}/output" \
+    --bs 1 --height 320 --width 512 \
+    --unconditional_guidance_scale 1.0 \
+    --ddim_steps 50 \
+    --ddim_eta 1.0 \
+    --prompt_dir "unitree_g1_pack_camera/case1/world_model_interaction_prompts" \
+    --dataset ${dataset} \
+    --video_length 16 \
+    --frame_stride 6 \
+    --n_action_steps 16 \
+    --exe_steps 16 \
+    --n_iter 11 \
+    --timestep_spacing 'uniform_trailing' \
+    --guidance_rescale 0.7 \
+    --perframe_ae
+} 2>&1 | tee "${res_dir}/output.log"

(new binary image file, 209 KiB)
(new CSV file, +2 lines)
@@ -0,0 +1,2 @@
+videoid,contentUrl,duration,data_dir,instruction,dynamic_confidence,dynamic_wording,dynamic_source_category,embodiment,fps
+0,x,x,unitree_g1_pack_camera,mount camera,x,x,x,G1_Dex1,30
unitree_g1_pack_camera/case2/output.log (new file, +179 lines)
@@ -0,0 +1,179 @@
+2026-02-18 19:05:45.956647: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-18 19:05:46.004149: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-18 19:05:46.004193: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-18 19:05:46.005265: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-18 19:05:46.012074: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2026-02-18 19:05:46.932966: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+Global seed set to 123
+>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
+>>> Prepared model loaded.
+INFO:root:***** Configing Data *****
+>>> unitree_z1_stackbox: 1 data samples loaded.
+>>> unitree_z1_stackbox: data stats loaded.
+>>> unitree_z1_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
+>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
+>>> unitree_g1_pack_camera: 1 data samples loaded.
+>>> unitree_g1_pack_camera: data stats loaded.
+>>> unitree_g1_pack_camera: normalizer initiated.
+>>> Dataset is successfully loaded ...
+✓ KV fused: 66 attention layers
+    TRT output 'y': [1, 4, 16, 40, 64] torch.float32
+    TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
+    TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
+    TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
+    TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
+    TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
+>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
+>>> Generate 16 frames under each generation ...
+DEBUG:h5py._conv:Creating converter from 3 to 5
+DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
+DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
+DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
+
+  0%|          | 0/11 [00:00<?, ?it/s][02/18/2026-19:05:59] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
+
+  9%|▉         | 1/11 [00:16<02:47, 16.71s/it]>>> Step 0: generating actions ...
+>>> Step 0: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 1: generating actions ...
+[~90 repeated DEBUG:PIL.Image plugin-import lines trimmed, verbatim duplicate of the run in case1/output.log]
|
|
||||||
|
18%|█▊ | 2/11 [00:33<02:30, 16.75s/it]
|
||||||
|
27%|██▋ | 3/11 [00:50<02:15, 16.91s/it]
|
||||||
|
36%|███▋ | 4/11 [01:07<01:59, 17.02s/it]
|
||||||
|
45%|████▌ | 5/11 [01:24<01:41, 16.98s/it]
|
||||||
|
55%|█████▍ | 6/11 [01:41<01:24, 16.94s/it]
|
||||||
|
64%|██████▎ | 7/11 [01:58<01:07, 16.90s/it]
|
||||||
|
73%|███████▎ | 8/11 [02:15<00:50, 16.83s/it]
|
||||||
|
82%|████████▏ | 9/11 [02:31<00:33, 16.80s/it]
|
||||||
|
91%|█████████ | 10/11 [02:49<00:16, 16.94s/it]
|
||||||
|
100%|██████████| 11/11 [03:06<00:00, 16.97s/it]
|
||||||
|
100%|██████████| 11/11 [03:06<00:00, 16.91s/it]
|
||||||
|
>>> Step 1: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 2: generating actions ...
|
||||||
|
>>> Step 2: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 3: generating actions ...
|
||||||
|
>>> Step 3: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 4: generating actions ...
|
||||||
|
>>> Step 4: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 5: generating actions ...
|
||||||
|
>>> Step 5: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 6: generating actions ...
|
||||||
|
>>> Step 6: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
|
>>> Step 7: generating actions ...
|
||||||
|
>>> Step 7: interacting with world model ...
|
||||||
|
>>>>>>>>>>>>>>>>>>>>>>>>
|
||||||
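The [TRT] [W] line at the top of this log is actionable: TensorRT inserts extra cudaStreamSynchronize() calls when enqueueV3() runs on the default stream. A minimal sketch of the fix, assuming the TensorRT >= 8.6 Python bindings, a static-shape engine, and torch-allocated I/O buffers; the wrapper and buffer names are illustrative, not this repo's actual API:

    # Hedged sketch: execute the engine on an explicit, non-default CUDA stream
    # so TensorRT stops falling back to synchronizing the default stream.
    import tensorrt as trt
    import torch

    logger = trt.Logger(trt.Logger.WARNING)
    with open("trt_engines/video_backbone.engine", "rb") as f:
        engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    stream = torch.cuda.Stream()  # dedicated non-default stream

    def run(buffers: dict[str, torch.Tensor]) -> None:
        # Bind every named I/O tensor to its device pointer.
        for name, tensor in buffers.items():
            context.set_tensor_address(name, tensor.data_ptr())
        # enqueueV3 is exposed as execute_async_v3 in the Python API.
        context.execute_async_v3(stream_handle=stream.cuda_stream)
        stream.synchronize()

Executing on a dedicated stream both silences the warning and lets the engine overlap with other GPU work.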
5
unitree_g1_pack_camera/case2/psnr_result.json
Normal file
@@ -0,0 +1,5 @@
{
    "gt_video": "unitree_g1_pack_camera/case2/unitree_g1_pack_camera_case2.mp4",
    "pred_video": "unitree_g1_pack_camera/case2/output/inference/50_full_fs6.mp4",
    "psnr": 34.61979248212279
}
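Each psnr_result.json like the one above pairs a ground-truth clip with the world model's rollout and records a single frame-averaged PSNR. A minimal sketch of how such a value can be computed, assuming imageio with the pyav plugin for decoding and that both clips share a resolution; this is not necessarily the repo's exact metric code:

    # Hedged sketch of a frame-averaged PSNR between two videos.
    import json
    import imageio.v3 as iio
    import numpy as np

    def video_psnr(gt_path: str, pred_path: str) -> float:
        gt = iio.imread(gt_path, plugin="pyav")      # (T, H, W, C) uint8
        pred = iio.imread(pred_path, plugin="pyav")
        n = min(len(gt), len(pred))                  # compare aligned frames only
        diff = gt[:n].astype(np.float64) - pred[:n].astype(np.float64)
        mse = np.mean(diff ** 2)
        return float(10.0 * np.log10(255.0 ** 2 / mse))

    result = {
        "gt_video": "unitree_g1_pack_camera/case2/unitree_g1_pack_camera_case2.mp4",
        "pred_video": "unitree_g1_pack_camera/case2/output/inference/50_full_fs6.mp4",
    }
    result["psnr"] = video_psnr(result["gt_video"], result["pred_video"])
    print(json.dumps(result, indent=4))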
24
unitree_g1_pack_camera/case2/run_world_model_interaction.sh
Normal file
@@ -0,0 +1,24 @@
res_dir="unitree_g1_pack_camera/case2"
dataset="unitree_g1_pack_camera"

{
time CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
    --seed 123 \
    --ckpt_path ckpts/unifolm_wma_dual.ckpt \
    --config configs/inference/world_model_interaction.yaml \
    --savedir "${res_dir}/output" \
    --bs 1 --height 320 --width 512 \
    --unconditional_guidance_scale 1.0 \
    --ddim_steps 50 \
    --ddim_eta 1.0 \
    --prompt_dir "unitree_g1_pack_camera/case2/world_model_interaction_prompts" \
    --dataset ${dataset} \
    --video_length 16 \
    --frame_stride 6 \
    --n_action_steps 16 \
    --exe_steps 16 \
    --n_iter 11 \
    --timestep_spacing 'uniform_trailing' \
    --guidance_rescale 0.7 \
    --perframe_ae
} 2>&1 | tee "${res_dir}/output.log"

After Width: | Height: | Size: 214 KiB |
@@ -0,0 +1,2 @@
videoid,contentUrl,duration,data_dir,instruction,dynamic_confidence,dynamic_wording,dynamic_source_category,embodiment,fps
50,x,x,unitree_g1_pack_camera,mount camera,x,x,x,G1_Dex1,30
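The two-line CSV above is the prompt metadata consumed through --prompt_dir; the "x" fields are clearly placeholders, and which columns the script actually reads is an assumption here (the logs only surface instruction, embodiment, and fps). A hedged sketch of loading it, with a hypothetical file name since the CSV's name is not shown in this diff:

    # Hedged sketch: parse a world_model_interaction_prompts CSV like the one
    # above. The file name "prompts.csv" is assumed, not taken from this diff.
    import pandas as pd

    prompts = pd.read_csv(
        "unitree_g1_pack_camera/case2/world_model_interaction_prompts/prompts.csv")
    for row in prompts.itertuples():
        print(f"videoid={row.videoid} instruction={row.instruction!r} "
              f"embodiment={row.embodiment} fps={row.fps}")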
146
unitree_g1_pack_camera/case3/output.log
Normal file
@@ -0,0 +1,146 @@
2026-02-18 19:09:35.113634: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-18 19:09:35.161428: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-02-18 19:09:35.161474: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-02-18 19:09:35.162551: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-02-18 19:09:35.169325: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-02-18 19:09:36.089250: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
Global seed set to 123
>>> Loading prepared model from ckpts/unifolm_wma_dual.ckpt.prepared.pt ...
>>> Prepared model loaded.
INFO:root:***** Configing Data *****
>>> unitree_z1_stackbox: 1 data samples loaded.
>>> unitree_z1_stackbox: data stats loaded.
>>> unitree_z1_stackbox: normalizer initiated.
>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
>>> unitree_z1_dual_arm_stackbox: data stats loaded.
>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
>>> unitree_g1_pack_camera: 1 data samples loaded.
>>> unitree_g1_pack_camera: data stats loaded.
>>> unitree_g1_pack_camera: normalizer initiated.
>>> Dataset is successfully loaded ...
✓ KV fused: 66 attention layers
TRT output 'y': [1, 4, 16, 40, 64] torch.float32
TRT output 'hs_a_0': [1, 16, 320, 40, 64] torch.float32
TRT output 'hs_a_1': [1, 16, 640, 20, 32] torch.float32
TRT output 'hs_a_2': [1, 16, 1280, 10, 16] torch.float32
TRT output 'hs_a_3': [1, 16, 1280, 5, 8] torch.float32
TRT output 'hs_a_4': [1, 16, 1280, 5, 8] torch.float32
TRT output 'hs_a_5': [1, 16, 1280, 5, 8] torch.float32
TRT output 'hs_a_6': [1, 16, 1280, 10, 16] torch.float32
TRT output 'hs_a_7': [1, 16, 640, 20, 32] torch.float32
TRT output 'hs_a_8': [1, 16, 320, 40, 64] torch.float32
>>> TRT backbone loaded from /home/qhy/unifolm-world-model-action/scripts/evaluation/../../trt_engines/video_backbone.engine
>>> Generate 16 frames under each generation ...
DEBUG:h5py._conv:Creating converter from 3 to 5
DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
 0%|          | 0/11 [00:00<?, ?it/s][02/18/2026-19:09:49] [TRT] [W] Using default stream in enqueueV3() may lead to performance issues due to additional calls to cudaStreamSynchronize() by TensorRT to ensure correct synchronization. Please use non-default stream instead.
 9%|▉         | 1/11 [00:16<02:45, 16.53s/it]>>> Step 0: generating actions ...
>>> Step 0: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 1: generating actions ...
DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
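This log's "✓ KV fused: 66 attention layers" line points at a standard cross-attention optimization: K and V are both linear maps of the same context tensor, so their projection weights can be concatenated into a single matmul. A minimal PyTorch sketch of the idea; the to_k/to_v names are illustrative, and the repo's 66 fused layers may differ in detail:

    # Hedged sketch of K/V projection fusion for cross-attention.
    import torch
    import torch.nn as nn

    @torch.no_grad()
    def fuse_kv(to_k: nn.Linear, to_v: nn.Linear) -> nn.Linear:
        assert to_k.in_features == to_v.in_features
        fused = nn.Linear(to_k.in_features,
                          to_k.out_features + to_v.out_features,
                          bias=to_k.bias is not None)
        fused.weight.copy_(torch.cat([to_k.weight, to_v.weight], dim=0))
        if to_k.bias is not None:
            fused.bias.copy_(torch.cat([to_k.bias, to_v.bias], dim=0))
        return fused

    # Usage: one matmul over the shared context, then split back into K and V.
    to_k = nn.Linear(1024, 320, bias=False)
    to_v = nn.Linear(1024, 320, bias=False)
    fused = fuse_kv(to_k, to_v)
    ctx = torch.randn(2, 77, 1024)
    k, v = fused(ctx).chunk(2, dim=-1)
    assert torch.allclose(k, to_k(ctx), atol=1e-6)
    assert torch.allclose(v, to_v(ctx), atol=1e-6)

Halving the number of projection kernel launches per cross-attention layer is what makes this attractive before exporting the backbone to TensorRT.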
5
unitree_g1_pack_camera/case3/psnr_result.json
Normal file
@@ -0,0 +1,5 @@
{
    "gt_video": "unitree_g1_pack_camera/case3/unitree_g1_pack_camera_case3.mp4",
    "pred_video": "unitree_g1_pack_camera/case3/output/inference/100_full_fs6.mp4",
    "psnr": 37.034952654534486
}

24
unitree_g1_pack_camera/case3/run_world_model_interaction.sh
Normal file
@@ -0,0 +1,24 @@
res_dir="unitree_g1_pack_camera/case3"
dataset="unitree_g1_pack_camera"

{
time CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
    --seed 123 \
    --ckpt_path ckpts/unifolm_wma_dual.ckpt \
    --config configs/inference/world_model_interaction.yaml \
    --savedir "${res_dir}/output" \
    --bs 1 --height 320 --width 512 \
    --unconditional_guidance_scale 1.0 \
    --ddim_steps 50 \
    --ddim_eta 1.0 \
    --prompt_dir "unitree_g1_pack_camera/case3/world_model_interaction_prompts" \
    --dataset ${dataset} \
    --video_length 16 \
    --frame_stride 6 \
    --n_action_steps 16 \
    --exe_steps 16 \
    --n_iter 11 \
    --timestep_spacing 'uniform_trailing' \
    --guidance_rescale 0.7 \
    --perframe_ae
} 2>&1 | tee "${res_dir}/output.log"

After Width: | Height: | Size: 190 KiB |
@@ -0,0 +1,2 @@
videoid,contentUrl,duration,data_dir,instruction,dynamic_confidence,dynamic_wording,dynamic_source_category,embodiment,fps
100,x,x,unitree_g1_pack_camera,mount camera,x,x,x,G1_Dex1,30

5
unitree_g1_pack_camera/case4/psnr_result.json
Normal file
@@ -0,0 +1,5 @@
{
    "gt_video": "unitree_g1_pack_camera/case4/unitree_g1_pack_camera_case4.mp4",
    "pred_video": "unitree_g1_pack_camera/case4/output/inference/200_full_fs6.mp4",
    "psnr": 31.43390896360405
}

24
unitree_g1_pack_camera/case4/run_world_model_interaction.sh
Normal file
@@ -0,0 +1,24 @@
res_dir="unitree_g1_pack_camera/case4"
dataset="unitree_g1_pack_camera"

{
time CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
    --seed 123 \
    --ckpt_path ckpts/unifolm_wma_dual.ckpt \
    --config configs/inference/world_model_interaction.yaml \
    --savedir "${res_dir}/output" \
    --bs 1 --height 320 --width 512 \
    --unconditional_guidance_scale 1.0 \
    --ddim_steps 50 \
    --ddim_eta 1.0 \
    --prompt_dir "unitree_g1_pack_camera/case4/world_model_interaction_prompts" \
    --dataset ${dataset} \
    --video_length 16 \
    --frame_stride 6 \
    --n_action_steps 16 \
    --exe_steps 16 \
    --n_iter 11 \
    --timestep_spacing 'uniform_trailing' \
    --guidance_rescale 0.7 \
    --perframe_ae
} 2>&1 | tee "${res_dir}/output.log"

After Width: | Height: | Size: 221 KiB |
@@ -0,0 +1,2 @@
videoid,contentUrl,duration,data_dir,instruction,dynamic_confidence,dynamic_wording,dynamic_source_category,embodiment,fps
200,x,x,unitree_g1_pack_camera,mount camera,x,x,x,G1_Dex1,30
121
unitree_z1_dual_arm_cleanup_pencils/case1/output.log
Normal file
@@ -0,0 +1,121 @@
2026-02-10 15:38:28.973314: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-10 15:38:29.023024: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-02-10 15:38:29.023070: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-02-10 15:38:29.024393: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-02-10 15:38:29.031901: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-02-10 15:38:29.955454: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
Global seed set to 123
INFO:mainlogger:LatentVisualDiffusion: Running in v-prediction mode
INFO:unifolm_wma.models.diffusion_head.conditional_unet1d:number of parameters: 5.010531e+08
INFO:unifolm_wma.models.diffusion_head.conditional_unet1d:number of parameters: 5.010531e+08
AE working on z of shape (1, 4, 32, 32) = 4096 dimensions.
INFO:root:Loaded ViT-H-14 model config.
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): hf-mirror.com:443
DEBUG:urllib3.connectionpool:https://hf-mirror.com:443 "HEAD /laion/CLIP-ViT-H-14-laion2B-s32B-b79K/resolve/main/open_clip_pytorch_model.bin HTTP/1.1" 302 0
INFO:root:Loading pretrained ViT-H-14 weights (laion2b_s32b_b79k).
INFO:root:Loaded ViT-H-14 model config.
DEBUG:urllib3.connectionpool:https://hf-mirror.com:443 "HEAD /laion/CLIP-ViT-H-14-laion2B-s32B-b79K/resolve/main/open_clip_pytorch_model.bin HTTP/1.1" 302 0
INFO:root:Loading pretrained ViT-H-14 weights (laion2b_s32b_b79k).
>>> model checkpoint loaded.
>>> Load pre-trained model ...
INFO:root:***** Configing Data *****
>>> unitree_z1_stackbox: 1 data samples loaded.
>>> unitree_z1_stackbox: data stats loaded.
>>> unitree_z1_stackbox: normalizer initiated.
>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
>>> unitree_z1_dual_arm_stackbox: data stats loaded.
>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
>>> unitree_g1_pack_camera: 1 data samples loaded.
>>> unitree_g1_pack_camera: data stats loaded.
>>> unitree_g1_pack_camera: normalizer initiated.
>>> Dataset is successfully loaded ...
>>> Generate 16 frames under each generation ...
DEBUG:h5py._conv:Creating converter from 3 to 5
DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13
DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096
 0%|          | 0/8 [00:00<?, ?it/s]>>> Step 0: generating actions ...
>>> Step 0: interacting with world model ...
DEBUG:PIL.Image:Image: failed to import FpxImagePlugin: No module named 'olefile'
DEBUG:PIL.Image:Image: failed to import MicImagePlugin: No module named 'olefile'
 12%|█▎        | 1/8 [01:14<08:41, 74.51s/it]
 25%|██▌       | 2/8 [02:29<07:28, 74.79s/it]
 38%|███▊      | 3/8 [03:44<06:14, 74.81s/it]
 50%|█████     | 4/8 [04:59<04:59, 74.78s/it]
 62%|██████▎   | 5/8 [06:13<03:44, 74.73s/it]
 75%|███████▌  | 6/8 [07:28<02:29, 74.66s/it]
 88%|████████▊ | 7/8 [08:42<01:14, 74.56s/it]
100%|██████████| 8/8 [09:56<00:00, 74.51s/it]
100%|██████████| 8/8 [09:56<00:00, 74.62s/it]
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 1: generating actions ...
>>> Step 1: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 2: generating actions ...
>>> Step 2: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 3: generating actions ...
>>> Step 3: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 4: generating actions ...
>>> Step 4: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
>>> Step 5: generating actions ...
>>> Step 5: interacting with world model ...
>>>>>>>>>>>>>>>>>>>>>>>>
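Unlike the TRT runs above, this baseline log rebuilds the whole pipeline on every launch (network lookups for the CLIP config, a full checkpoint load) and then settles at ~74.6 s/it, versus ~16.9 s/it with the TRT backbone. The ckpts/unifolm_wma_dual.ckpt.prepared.pt path in the earlier logs suggests a caching pattern like the sketch below: construct once via the slow path, then torch.save the ready object. The build callable is a stand-in, not the repo's real loader:

    # Hedged sketch of the *.prepared.pt caching pattern implied by the logs.
    import os
    import torch
    import torch.nn as nn

    def load_prepared(ckpt_path, build):
        # `build` is any zero-argument callable that constructs the full model
        # (the slow path shown in this log); the result is cached next to the ckpt.
        prepared = ckpt_path + ".prepared.pt"
        if os.path.exists(prepared):
            print(f">>> Loading prepared model from {prepared} ...")
            model = torch.load(prepared, map_location="cpu", weights_only=False)
            print(">>> Prepared model loaded.")
            return model
        model = build()
        torch.save(model, prepared)  # cache the constructed object for next run
        return model

    # Usage with a trivial stand-in model:
    model = load_prepared("ckpts/unifolm_wma_dual.ckpt", lambda: nn.Linear(8, 8))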
@@ -0,0 +1,5 @@
{
    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case1/unitree_z1_dual_arm_cleanup_pencils_case1.mp4",
    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/0_full_fs4.mp4",
    "psnr": 47.911564449209735
}

@@ -0,0 +1,24 @@
res_dir="unitree_z1_dual_arm_cleanup_pencils/case1"
dataset="unitree_z1_dual_arm_cleanup_pencils"

{
time CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
    --seed 123 \
    --ckpt_path ckpts/unifolm_wma_dual.ckpt \
    --config configs/inference/world_model_interaction.yaml \
    --savedir "${res_dir}/output" \
    --bs 1 --height 320 --width 512 \
    --unconditional_guidance_scale 1.0 \
    --ddim_steps 50 \
    --ddim_eta 1.0 \
    --prompt_dir "unitree_z1_dual_arm_cleanup_pencils/case1/world_model_interaction_prompts" \
    --dataset ${dataset} \
    --video_length 16 \
    --frame_stride 4 \
    --n_action_steps 16 \
    --exe_steps 16 \
    --n_iter 8 \
    --timestep_spacing 'uniform_trailing' \
    --guidance_rescale 0.7 \
    --perframe_ae
} 2>&1 | tee "${res_dir}/output.log"

After Width: | Height: | Size: 212 KiB |
@@ -0,0 +1,2 @@
videoid,contentUrl,duration,data_dir,instruction,dynamic_confidence,dynamic_wording,dynamic_source_category,embodiment,fps
0,x,x,unitree_z1_dual_arm_cleanup_pencils,clean up eraser and pencils,x,x,x,Z1_Dual_Dex1,30

@@ -0,0 +1,5 @@
{
    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case2/unitree_z1_dual_arm_cleanup_pencils_case2.mp4",
    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case2/output/inference/50_full_fs4.mp4",
    "psnr": 48.344571927558974
}

@@ -0,0 +1,24 @@
res_dir="unitree_z1_dual_arm_cleanup_pencils/case2"
dataset="unitree_z1_dual_arm_cleanup_pencils"

{
time CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
    --seed 123 \
    --ckpt_path ckpts/unifolm_wma_dual.ckpt \
    --config configs/inference/world_model_interaction.yaml \
    --savedir "${res_dir}/output" \
    --bs 1 --height 320 --width 512 \
    --unconditional_guidance_scale 1.0 \
    --ddim_steps 50 \
    --ddim_eta 1.0 \
    --prompt_dir "unitree_z1_dual_arm_cleanup_pencils/case2/world_model_interaction_prompts" \
    --dataset ${dataset} \
    --video_length 16 \
    --frame_stride 4 \
    --n_action_steps 16 \
    --exe_steps 16 \
    --n_iter 8 \
    --timestep_spacing 'uniform_trailing' \
    --guidance_rescale 0.7 \
    --perframe_ae
} 2>&1 | tee "${res_dir}/output.log"

After Width: | Height: | Size: 202 KiB |
@@ -0,0 +1,2 @@
videoid,contentUrl,duration,data_dir,instruction,dynamic_confidence,dynamic_wording,dynamic_source_category,embodiment,fps
50,x,x,unitree_z1_dual_arm_cleanup_pencils,clean up eraser and pencils,x,x,x,Z1_Dual_Dex1,30

@@ -0,0 +1,5 @@
{
    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case3/unitree_z1_dual_arm_cleanup_pencils_case3.mp4",
    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case3/output/inference/100_full_fs4.mp4",
    "psnr": 41.152374490134825
}

@@ -0,0 +1,24 @@
res_dir="unitree_z1_dual_arm_cleanup_pencils/case3"
dataset="unitree_z1_dual_arm_cleanup_pencils"

{
time CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
    --seed 123 \
    --ckpt_path ckpts/unifolm_wma_dual.ckpt \
    --config configs/inference/world_model_interaction.yaml \
    --savedir "${res_dir}/output" \
    --bs 1 --height 320 --width 512 \
    --unconditional_guidance_scale 1.0 \
    --ddim_steps 50 \
    --ddim_eta 1.0 \
    --prompt_dir "unitree_z1_dual_arm_cleanup_pencils/case3/world_model_interaction_prompts" \
    --dataset ${dataset} \
    --video_length 16 \
    --frame_stride 4 \
    --n_action_steps 16 \
    --exe_steps 16 \
    --n_iter 8 \
    --timestep_spacing 'uniform_trailing' \
    --guidance_rescale 0.7 \
    --perframe_ae
} 2>&1 | tee "${res_dir}/output.log"

After Width: | Height: | Size: 183 KiB |
@@ -0,0 +1,2 @@
videoid,contentUrl,duration,data_dir,instruction,dynamic_confidence,dynamic_wording,dynamic_source_category,embodiment,fps
100,x,x,unitree_z1_dual_arm_cleanup_pencils,clean up eraser and pencils,x,x,x,Z1_Dual_Dex1,30

@@ -0,0 +1,5 @@
{
    "gt_video": "unitree_z1_dual_arm_cleanup_pencils/case4/unitree_z1_dual_arm_cleanup_pencils_case4.mp4",
    "pred_video": "unitree_z1_dual_arm_cleanup_pencils/case4/output/inference/200_full_fs4.mp4",
    "psnr": 46.025723557253855
}

@@ -0,0 +1,24 @@
res_dir="unitree_z1_dual_arm_cleanup_pencils/case4"
dataset="unitree_z1_dual_arm_cleanup_pencils"

{
time CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
    --seed 123 \
    --ckpt_path ckpts/unifolm_wma_dual.ckpt \
    --config configs/inference/world_model_interaction.yaml \
    --savedir "${res_dir}/output" \
    --bs 1 --height 320 --width 512 \
    --unconditional_guidance_scale 1.0 \
    --ddim_steps 50 \
    --ddim_eta 1.0 \
    --prompt_dir "unitree_z1_dual_arm_cleanup_pencils/case4/world_model_interaction_prompts" \
    --dataset ${dataset} \
    --video_length 16 \
    --frame_stride 4 \
    --n_action_steps 16 \
    --exe_steps 16 \
    --n_iter 8 \
    --timestep_spacing 'uniform_trailing' \
    --guidance_rescale 0.7 \
    --perframe_ae
} 2>&1 | tee "${res_dir}/output.log"

After Width: | Height: | Size: 174 KiB |
@@ -0,0 +1,2 @@
videoid,contentUrl,duration,data_dir,instruction,dynamic_confidence,dynamic_wording,dynamic_source_category,embodiment,fps
200,x,x,unitree_z1_dual_arm_cleanup_pencils,clean up eraser and pencils,x,x,x,Z1_Dual_Dex1,30
5
unitree_z1_dual_arm_stackbox/case1/psnr_result.json
Normal file
@@ -0,0 +1,5 @@
{
    "gt_video": "unitree_z1_dual_arm_stackbox/case1/unitree_z1_dual_arm_stackbox_case1.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox/case1/output/inference/5_full_fs4.mp4",
    "psnr": 44.3480149502738
}

@@ -0,0 +1,24 @@
res_dir="unitree_z1_dual_arm_stackbox/case1"
dataset="unitree_z1_dual_arm_stackbox"

{
time CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
    --seed 123 \
    --ckpt_path ckpts/unifolm_wma_dual.ckpt \
    --config configs/inference/world_model_interaction.yaml \
    --savedir "${res_dir}/output" \
    --bs 1 --height 320 --width 512 \
    --unconditional_guidance_scale 1.0 \
    --ddim_steps 50 \
    --ddim_eta 1.0 \
    --prompt_dir "unitree_z1_dual_arm_stackbox/case1/world_model_interaction_prompts" \
    --dataset ${dataset} \
    --video_length 16 \
    --frame_stride 4 \
    --n_action_steps 16 \
    --exe_steps 16 \
    --n_iter 7 \
    --timestep_spacing 'uniform_trailing' \
    --guidance_rescale 0.7 \
    --perframe_ae
} 2>&1 | tee "${res_dir}/output.log"

After Width: | Height: | Size: 272 KiB |
@@ -0,0 +1,2 @@
videoid,contentUrl,duration,data_dir,instruction,dynamic_confidence,dynamic_wording,dynamic_source_category,embodiment,fps
5,x,x,unitree_z1_dual_arm_stackbox,"Pick up the red cup on the table.",x,x,x,Unitree Z1 Robot Dual-Arm,30

5
unitree_z1_dual_arm_stackbox/case2/psnr_result.json
Normal file
@@ -0,0 +1,5 @@
{
    "gt_video": "unitree_z1_dual_arm_stackbox/case2/unitree_z1_dual_arm_stackbox_case2.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox/case2/output/inference/15_full_fs4.mp4",
    "psnr": 39.867728254007716
}

@@ -0,0 +1,24 @@
res_dir="unitree_z1_dual_arm_stackbox/case2"
dataset="unitree_z1_dual_arm_stackbox"

{
time CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
    --seed 123 \
    --ckpt_path ckpts/unifolm_wma_dual.ckpt \
    --config configs/inference/world_model_interaction.yaml \
    --savedir "${res_dir}/output" \
    --bs 1 --height 320 --width 512 \
    --unconditional_guidance_scale 1.0 \
    --ddim_steps 50 \
    --ddim_eta 1.0 \
    --prompt_dir "unitree_z1_dual_arm_stackbox/case2/world_model_interaction_prompts" \
    --dataset ${dataset} \
    --video_length 16 \
    --frame_stride 4 \
    --n_action_steps 16 \
    --exe_steps 16 \
    --n_iter 7 \
    --timestep_spacing 'uniform_trailing' \
    --guidance_rescale 0.7 \
    --perframe_ae
} 2>&1 | tee "${res_dir}/output.log"

After Width: | Height: | Size: 268 KiB |
@@ -0,0 +1,2 @@
videoid,contentUrl,duration,data_dir,instruction,dynamic_confidence,dynamic_wording,dynamic_source_category,embodiment,fps
15,x,x,unitree_z1_dual_arm_stackbox,"Pick up the red cup on the table.",x,x,x,Unitree Z1 Robot Dual-Arm,30

5
unitree_z1_dual_arm_stackbox/case3/psnr_result.json
Normal file
@@ -0,0 +1,5 @@
{
    "gt_video": "unitree_z1_dual_arm_stackbox/case3/unitree_z1_dual_arm_stackbox_case3.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox/case3/output/inference/25_full_fs4.mp4",
    "psnr": 39.19101039445159
}

@@ -0,0 +1,24 @@
res_dir="unitree_z1_dual_arm_stackbox/case3"
dataset="unitree_z1_dual_arm_stackbox"

{
time CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
    --seed 123 \
    --ckpt_path ckpts/unifolm_wma_dual.ckpt \
    --config configs/inference/world_model_interaction.yaml \
    --savedir "${res_dir}/output" \
    --bs 1 --height 320 --width 512 \
    --unconditional_guidance_scale 1.0 \
    --ddim_steps 50 \
    --ddim_eta 1.0 \
    --prompt_dir "unitree_z1_dual_arm_stackbox/case3/world_model_interaction_prompts" \
    --dataset ${dataset} \
    --video_length 16 \
    --frame_stride 4 \
    --n_action_steps 16 \
    --exe_steps 16 \
    --n_iter 7 \
    --timestep_spacing 'uniform_trailing' \
    --guidance_rescale 0.7 \
    --perframe_ae
} 2>&1 | tee "${res_dir}/output.log"

After Width: | Height: | Size: 267 KiB |
@@ -0,0 +1,2 @@
videoid,contentUrl,duration,data_dir,instruction,dynamic_confidence,dynamic_wording,dynamic_source_category,embodiment,fps
25,x,x,unitree_z1_dual_arm_stackbox,"Pick up the red cup on the table.",x,x,x,Unitree Z1 Robot Dual-Arm,30

5
unitree_z1_dual_arm_stackbox/case4/psnr_result.json
Normal file
@@ -0,0 +1,5 @@
{
    "gt_video": "unitree_z1_dual_arm_stackbox/case4/unitree_z1_dual_arm_stackbox_case4.mp4",
    "pred_video": "unitree_z1_dual_arm_stackbox/case4/output/inference/35_full_fs4.mp4",
    "psnr": 40.29563315341769
}

@@ -0,0 +1,24 @@
res_dir="unitree_z1_dual_arm_stackbox/case4"
dataset="unitree_z1_dual_arm_stackbox"

{
time CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/world_model_interaction.py \
    --seed 123 \
    --ckpt_path ckpts/unifolm_wma_dual.ckpt \
    --config configs/inference/world_model_interaction.yaml \
    --savedir "${res_dir}/output" \
    --bs 1 --height 320 --width 512 \
    --unconditional_guidance_scale 1.0 \
    --ddim_steps 50 \
    --ddim_eta 1.0 \
    --prompt_dir "unitree_z1_dual_arm_stackbox/case4/world_model_interaction_prompts" \
    --dataset ${dataset} \
    --video_length 16 \
    --frame_stride 4 \
    --n_action_steps 16 \
    --exe_steps 16 \
    --n_iter 7 \
    --timestep_spacing 'uniform_trailing' \
    --guidance_rescale 0.7 \
    --perframe_ae
} 2>&1 | tee "${res_dir}/output.log"
After Width: | Height: | Size: 280 KiB |
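Every case directory in this diff carries the same three artifacts: a run_world_model_interaction.sh, the tee'd output.log, and a psnr_result.json. A hedged driver that replays all of them and tabulates the PSNRs; the glob patterns are inferred from the paths above and may need adjusting:

    # Hedged sketch: run every per-case evaluation script shown in this diff
    # and collect the psnr_result.json values into one summary table.
    import glob
    import json
    import subprocess

    for script in sorted(glob.glob("unitree_*/case*/run_world_model_interaction.sh")):
        subprocess.run(["bash", script], check=True)

    for path in sorted(glob.glob("unitree_*/case*/psnr_result.json")):
        with open(path) as f:
            psnr = json.load(f)["psnr"]
        print(f"{psnr:8.3f}  {path}")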