From ff920b85a2cdcf8408d3ea55bdf6873dd32a37e4 Mon Sep 17 00:00:00 2001 From: qhy <2728290997@qq.com> Date: Tue, 10 Feb 2026 10:10:09 +0800 Subject: [PATCH] =?UTF-8?q?=E7=90=86=E8=AE=BA=E6=80=A7=E8=83=BD=E5=88=86?= =?UTF-8?q?=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../profile_output/profiling_report.txt | 136 ++++++++++-------- 1 file changed, 77 insertions(+), 59 deletions(-) diff --git a/unitree_g1_pack_camera/case1/output/profile_output/profiling_report.txt b/unitree_g1_pack_camera/case1/output/profile_output/profiling_report.txt index 98ac526..24af916 100644 --- a/unitree_g1_pack_camera/case1/output/profile_output/profiling_report.txt +++ b/unitree_g1_pack_camera/case1/output/profile_output/profiling_report.txt @@ -7,79 +7,97 @@ MACRO-LEVEL TIMING SUMMARY ---------------------------------------- Section Count Total(ms) Avg(ms) CUDA Avg(ms) -------------------------------------------------------------------------------------- -action_generation 11 399707.47 36337.04 36336.85 -data_loading 1 52.85 52.85 52.88 -get_latent_z/encode 22 901.39 40.97 41.01 -iteration_total 11 836793.23 76072.11 76071.63 -load_transitions 1 2.24 2.24 2.28 -model_loading/checkpoint 1 11833.31 11833.31 11833.43 -model_loading/config 1 49774.19 49774.19 49774.16 -model_to_cuda 1 8909.30 8909.30 8909.33 -prepare_init_input 1 10.52 10.52 10.55 -prepare_observation 11 5.41 0.49 0.53 -prepare_wm_observation 11 2.12 0.19 0.22 -save_results 11 38668.06 3515.28 3515.32 -synthesis/conditioning_prep 22 2916.63 132.57 132.61 -synthesis/ddim_sampling 22 782695.01 35577.05 35576.86 -synthesis/decode_first_stage 22 12444.31 565.65 565.70 -update_action_queues 11 6.85 0.62 0.65 -update_state_queues 11 17.67 1.61 1.64 -world_model_interaction 11 398375.58 36215.96 36215.75 +action_generation 11 173133.54 15739.41 15739.36 +data_loading 1 54.31 54.31 54.34 +get_latent_z/encode 22 785.25 35.69 35.72 +iteration_total 11 386482.08 35134.73 35134.55 +load_transitions 1 2.07 2.07 2.10 +model_loading/prepared 1 4749.22 4749.22 4749.83 +prepare_init_input 1 29.19 29.19 29.22 +prepare_observation 11 5.49 0.50 0.53 +prepare_wm_observation 11 1.93 0.18 0.20 +save_results 11 38791.18 3526.47 3526.51 +synthesis/conditioning_prep 22 2528.23 114.92 114.95 +synthesis/ddim_sampling 22 336003.29 15272.88 15272.83 +synthesis/decode_first_stage 22 9095.14 413.42 413.46 +update_action_queues 11 7.28 0.66 0.69 +update_state_queues 11 17.38 1.58 1.61 +world_model_interaction 11 174516.52 15865.14 15865.07 -------------------------------------------------------------------------------------- -TOTAL 2543116.13 +TOTAL 1126202.08 ---------------------------------------- GPU MEMORY SUMMARY ---------------------------------------- -Peak allocated: 17890.50 MB -Average allocated: 16129.98 MB +Peak allocated: 18188.29 MB +Average allocated: 9117.49 MB ---------------------------------------- TOP 30 OPERATORS BY CUDA TIME ---------------------------------------- Operator Count CUDA(ms) CPU(ms) Self CUDA(ms) ------------------------------------------------------------------------------------------------ -ProfilerStep* 6 443804.16 237696.98 237689.25 -aten::linear 171276 112286.23 13179.82 0.00 -aten::addmm 81456 79537.36 3799.84 79296.37 -ampere_sgemm_128x64_tn 26400 52052.10 0.00 52052.10 -aten::matmul 90468 34234.05 6281.32 0.00 -aten::_convolution 100242 33623.79 13105.89 0.00 -aten::mm 89820 33580.74 3202.22 33253.18 -aten::convolution 100242 33575.23 13714.47 0.00 -aten::cudnn_convolution 98430 30932.19 8640.50 29248.12 -ampere_sgemm_32x128_tn 42348 20394.52 0.00 20394.52 -aten::conv2d 42042 18115.35 5932.30 0.00 -ampere_sgemm_128x32_tn 40938 16429.81 0.00 16429.81 -xformers::efficient_attention_forward_cutlass 24000 15222.23 2532.93 15120.44 -fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti... 24000 15121.31 0.00 15121.31 -ampere_sgemm_64x64_tn 21000 14627.12 0.00 14627.12 -aten::copy_ 231819 14504.87 127056.51 14038.39 -aten::group_norm 87144 12033.73 10659.57 0.00 -aten::native_group_norm 87144 11473.40 9449.36 11002.02 -aten::conv3d 26400 8852.13 3365.43 0.00 -void at::native::(anonymous namespace)::Rowwise... 87144 8714.68 0.00 8714.68 -void cudnn::ops::nchwToNhwcKernel Device) 6696 5856.39 0.00 5856.39 -aten::reshape 671772 5124.03 9636.01 0.00 -sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t... 16272 5097.70 0.00 5097.70 +ProfilerStep* 18 690146.23 133688.74 616385.44 +aten::group_norm 168624 24697.84 29217.27 0.00 +aten::_convolution 96450 21420.26 12845.86 0.00 +aten::convolution 96450 21408.68 13480.97 0.00 +aten::linear 297398 20780.15 26257.38 0.00 +aten::cudnn_convolution 94638 18660.24 8239.04 18329.28 +aten::copy_ 772677 18135.46 17387.09 17864.87 +aten::conv3d 52800 12922.42 8572.58 0.00 +aten::conv2d 52469 12747.13 7725.70 0.00 +aten::native_group_norm 84312 10285.37 8974.31 10197.66 +aten::_to_copy 590277 10270.09 22570.90 0.00 +aten::to 602979 9655.26 23666.06 0.00 +aten::conv1d 56245 8174.37 10015.24 0.00 +void at::native::(anonymous namespace)::Rowwise... 84312 7979.71 0.00 7979.71 +aten::clone 177132 7502.90 7007.48 0.00 +void cudnn::ops::nchwToNhwcKernel<__nv_bfloat16... 164700 7384.52 0.00 7384.52 +aten::addmm 81456 6958.44 3903.01 6908.44 +aten::layer_norm 65700 5698.92 7816.08 0.00 +void at::native::elementwise_kernel<128, 4, at:... 149688 5372.46 0.00 5372.46 +void at::native::unrolled_elementwise_kernel60%). GEMM kernels are compute-bound + and running efficiently on Tensor Cores.