性能剖析

This commit is contained in:
2026-01-18 00:31:39 +08:00
parent 25c6fc04db
commit c86c2be5ff
26 changed files with 272 additions and 54 deletions

View File

@@ -0,0 +1,85 @@
================================================================================
PERFORMANCE PROFILING REPORT
================================================================================
----------------------------------------
MACRO-LEVEL TIMING SUMMARY
----------------------------------------
Section Count Total(ms) Avg(ms) CUDA Avg(ms)
--------------------------------------------------------------------------------------
action_generation 11 399707.47 36337.04 36336.85
data_loading 1 52.85 52.85 52.88
get_latent_z/encode 22 901.39 40.97 41.01
iteration_total 11 836793.23 76072.11 76071.63
load_transitions 1 2.24 2.24 2.28
model_loading/checkpoint 1 11833.31 11833.31 11833.43
model_loading/config 1 49774.19 49774.19 49774.16
model_to_cuda 1 8909.30 8909.30 8909.33
prepare_init_input 1 10.52 10.52 10.55
prepare_observation 11 5.41 0.49 0.53
prepare_wm_observation 11 2.12 0.19 0.22
save_results 11 38668.06 3515.28 3515.32
synthesis/conditioning_prep 22 2916.63 132.57 132.61
synthesis/ddim_sampling 22 782695.01 35577.05 35576.86
synthesis/decode_first_stage 22 12444.31 565.65 565.70
update_action_queues 11 6.85 0.62 0.65
update_state_queues 11 17.67 1.61 1.64
world_model_interaction 11 398375.58 36215.96 36215.75
--------------------------------------------------------------------------------------
TOTAL 2543116.13
----------------------------------------
GPU MEMORY SUMMARY
----------------------------------------
Peak allocated: 17890.50 MB
Average allocated: 16129.98 MB
----------------------------------------
TOP 30 OPERATORS BY CUDA TIME
----------------------------------------
Operator Count CUDA(ms) CPU(ms) Self CUDA(ms)
------------------------------------------------------------------------------------------------
ProfilerStep* 6 443804.16 237696.98 237689.25
aten::linear 171276 112286.23 13179.82 0.00
aten::addmm 81456 79537.36 3799.84 79296.37
ampere_sgemm_128x64_tn 26400 52052.10 0.00 52052.10
aten::matmul 90468 34234.05 6281.32 0.00
aten::_convolution 100242 33623.79 13105.89 0.00
aten::mm 89820 33580.74 3202.22 33253.18
aten::convolution 100242 33575.23 13714.47 0.00
aten::cudnn_convolution 98430 30932.19 8640.50 29248.12
ampere_sgemm_32x128_tn 42348 20394.52 0.00 20394.52
aten::conv2d 42042 18115.35 5932.30 0.00
ampere_sgemm_128x32_tn 40938 16429.81 0.00 16429.81
xformers::efficient_attention_forward_cutlass 24000 15222.23 2532.93 15120.44
fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti... 24000 15121.31 0.00 15121.31
ampere_sgemm_64x64_tn 21000 14627.12 0.00 14627.12
aten::copy_ 231819 14504.87 127056.51 14038.39
aten::group_norm 87144 12033.73 10659.57 0.00
aten::native_group_norm 87144 11473.40 9449.36 11002.02
aten::conv3d 26400 8852.13 3365.43 0.00
void at::native::(anonymous namespace)::Rowwise... 87144 8714.68 0.00 8714.68
void cudnn::ops::nchwToNhwcKernel<float, float,... 169824 8525.44 0.00 8525.44
aten::clone 214314 8200.26 8568.82 0.00
void at::native::elementwise_kernel<128, 2, at:... 220440 8109.62 0.00 8109.62
void cutlass::Kernel<cutlass_80_simt_sgemm_128x... 15000 7919.30 0.00 7919.30
aten::_to_copy 12219 5963.43 122411.53 0.00
aten::to 58101 5952.65 122443.72 0.00
aten::conv1d 30000 5878.95 4556.48 0.00
Memcpy HtoD (Pageable -> Device) 6696 5856.39 0.00 5856.39
aten::reshape 671772 5124.03 9636.01 0.00
sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t... 16272 5097.70 0.00 5097.70
----------------------------------------
OPERATOR CATEGORY BREAKDOWN
----------------------------------------
Category CUDA Time(ms) Percentage
---------------------------------------------------------
Other 481950.47 41.9%
Linear/GEMM 342333.09 29.8%
Convolution 159920.77 13.9%
Elementwise 54682.93 4.8%
Memory 36883.36 3.2%
Attention 34736.13 3.0%
Normalization 32081.19 2.8%
Activation 6449.19 0.6%

View File

@@ -0,0 +1,85 @@
================================================================================
PERFORMANCE PROFILING REPORT
================================================================================
----------------------------------------
MACRO-LEVEL TIMING SUMMARY
----------------------------------------
Section Count Total(ms) Avg(ms) CUDA Avg(ms)
--------------------------------------------------------------------------------------
action_generation 11 394370.58 35851.87 35851.67
data_loading 1 52.00 52.00 52.03
get_latent_z/encode 22 899.25 40.88 40.91
iteration_total 11 830856.07 75532.37 75531.89
load_transitions 1 2.11 2.11 2.16
model_loading/checkpoint 1 10410.48 10410.48 10410.60
model_loading/config 1 49460.02 49460.02 49460.01
model_to_cuda 1 4398.71 4398.71 4398.74
prepare_init_input 1 10.26 10.26 10.29
prepare_observation 11 5.08 0.46 0.49
prepare_wm_observation 11 2.03 0.18 0.21
save_results 11 40851.48 3713.77 3713.80
synthesis/conditioning_prep 22 2270.48 103.20 103.24
synthesis/ddim_sampling 22 775253.03 35238.77 35238.59
synthesis/decode_first_stage 22 12416.36 564.38 564.43
update_action_queues 11 6.27 0.57 0.60
update_state_queues 11 16.57 1.51 1.54
world_model_interaction 11 395594.93 35963.18 35962.96
--------------------------------------------------------------------------------------
TOTAL 2516875.71
----------------------------------------
GPU MEMORY SUMMARY
----------------------------------------
Peak allocated: 17890.50 MB
Average allocated: 16129.98 MB
----------------------------------------
TOP 30 OPERATORS BY CUDA TIME
----------------------------------------
Operator Count CUDA(ms) CPU(ms) Self CUDA(ms)
------------------------------------------------------------------------------------------------
ProfilerStep* 6 438046.75 232814.87 232809.14
aten::linear 171276 112786.01 10941.68 0.00
aten::addmm 81456 79765.93 3676.25 79525.01
ampere_sgemm_128x64_tn 26400 52203.84 0.00 52203.84
aten::matmul 90468 34345.67 5341.43 0.00
aten::_convolution 100242 33699.82 12792.11 0.00
aten::mm 89820 33690.79 3067.07 33361.05
aten::convolution 100242 33629.44 13178.80 0.00
aten::cudnn_convolution 98430 31003.85 9020.54 29316.78
ampere_sgemm_32x128_tn 42348 20439.71 0.00 20439.71
aten::conv2d 42042 18256.98 5775.15 0.00
ampere_sgemm_128x32_tn 40938 16493.37 0.00 16493.37
xformers::efficient_attention_forward_cutlass 24000 15256.14 2372.78 15154.49
fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti... 24000 15155.37 0.00 15155.37
ampere_sgemm_64x64_tn 21000 14660.16 0.00 14660.16
aten::copy_ 231819 13133.93 137045.31 12663.88
aten::group_norm 87144 12058.55 9417.15 0.00
aten::native_group_norm 87144 11497.70 8394.42 11024.58
aten::conv3d 26400 8909.30 3210.64 0.00
void at::native::(anonymous namespace)::Rowwise... 87144 8732.10 0.00 8732.10
void cudnn::ops::nchwToNhwcKernel<float, float,... 169824 8550.65 0.00 8550.65
aten::clone 214314 8182.15 7704.97 0.00
void at::native::elementwise_kernel<128, 2, at:... 220440 8122.53 0.00 8122.53
void cutlass::Kernel<cutlass_80_simt_sgemm_128x... 15000 7959.63 0.00 7959.63
aten::conv1d 30000 5921.64 4150.30 0.00
aten::reshape 671772 5134.95 7968.26 0.00
sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t... 16272 5106.25 0.00 5106.25
void cutlass_cudnn_infer::Kernel<cutlass_tensor... 4200 4882.51 0.00 4882.51
aten::_to_copy 12219 4575.90 132491.24 0.00
aten::to 58101 4568.11 132512.86 0.00
----------------------------------------
OPERATOR CATEGORY BREAKDOWN
----------------------------------------
Category CUDA Time(ms) Percentage
---------------------------------------------------------
Other 473442.20 41.5%
Linear/GEMM 343517.32 30.1%
Convolution 160436.45 14.1%
Elementwise 54809.55 4.8%
Attention 34810.12 3.1%
Memory 34401.76 3.0%
Normalization 32147.89 2.8%
Activation 6457.30 0.6%

View File

@@ -22,7 +22,8 @@ dataset="unitree_g1_pack_camera"
--guidance_rescale 0.7 \
--perframe_ae \
--profile \
--profile_iterations 3
--profile_iterations 3 \
--profile_detail full
} 2>&1 | tee "${res_dir}/output_profile.log"
echo ""