性能剖析
This commit is contained in:
@@ -0,0 +1,85 @@
|
||||
================================================================================
|
||||
PERFORMANCE PROFILING REPORT
|
||||
================================================================================
|
||||
|
||||
----------------------------------------
|
||||
MACRO-LEVEL TIMING SUMMARY
|
||||
----------------------------------------
|
||||
Section Count Total(ms) Avg(ms) CUDA Avg(ms)
|
||||
--------------------------------------------------------------------------------------
|
||||
action_generation 11 399707.47 36337.04 36336.85
|
||||
data_loading 1 52.85 52.85 52.88
|
||||
get_latent_z/encode 22 901.39 40.97 41.01
|
||||
iteration_total 11 836793.23 76072.11 76071.63
|
||||
load_transitions 1 2.24 2.24 2.28
|
||||
model_loading/checkpoint 1 11833.31 11833.31 11833.43
|
||||
model_loading/config 1 49774.19 49774.19 49774.16
|
||||
model_to_cuda 1 8909.30 8909.30 8909.33
|
||||
prepare_init_input 1 10.52 10.52 10.55
|
||||
prepare_observation 11 5.41 0.49 0.53
|
||||
prepare_wm_observation 11 2.12 0.19 0.22
|
||||
save_results 11 38668.06 3515.28 3515.32
|
||||
synthesis/conditioning_prep 22 2916.63 132.57 132.61
|
||||
synthesis/ddim_sampling 22 782695.01 35577.05 35576.86
|
||||
synthesis/decode_first_stage 22 12444.31 565.65 565.70
|
||||
update_action_queues 11 6.85 0.62 0.65
|
||||
update_state_queues 11 17.67 1.61 1.64
|
||||
world_model_interaction 11 398375.58 36215.96 36215.75
|
||||
--------------------------------------------------------------------------------------
|
||||
TOTAL 2543116.13
|
||||
|
||||
----------------------------------------
|
||||
GPU MEMORY SUMMARY
|
||||
----------------------------------------
|
||||
Peak allocated: 17890.50 MB
|
||||
Average allocated: 16129.98 MB
|
||||
|
||||
----------------------------------------
|
||||
TOP 30 OPERATORS BY CUDA TIME
|
||||
----------------------------------------
|
||||
Operator Count CUDA(ms) CPU(ms) Self CUDA(ms)
|
||||
------------------------------------------------------------------------------------------------
|
||||
ProfilerStep* 6 443804.16 237696.98 237689.25
|
||||
aten::linear 171276 112286.23 13179.82 0.00
|
||||
aten::addmm 81456 79537.36 3799.84 79296.37
|
||||
ampere_sgemm_128x64_tn 26400 52052.10 0.00 52052.10
|
||||
aten::matmul 90468 34234.05 6281.32 0.00
|
||||
aten::_convolution 100242 33623.79 13105.89 0.00
|
||||
aten::mm 89820 33580.74 3202.22 33253.18
|
||||
aten::convolution 100242 33575.23 13714.47 0.00
|
||||
aten::cudnn_convolution 98430 30932.19 8640.50 29248.12
|
||||
ampere_sgemm_32x128_tn 42348 20394.52 0.00 20394.52
|
||||
aten::conv2d 42042 18115.35 5932.30 0.00
|
||||
ampere_sgemm_128x32_tn 40938 16429.81 0.00 16429.81
|
||||
xformers::efficient_attention_forward_cutlass 24000 15222.23 2532.93 15120.44
|
||||
fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti... 24000 15121.31 0.00 15121.31
|
||||
ampere_sgemm_64x64_tn 21000 14627.12 0.00 14627.12
|
||||
aten::copy_ 231819 14504.87 127056.51 14038.39
|
||||
aten::group_norm 87144 12033.73 10659.57 0.00
|
||||
aten::native_group_norm 87144 11473.40 9449.36 11002.02
|
||||
aten::conv3d 26400 8852.13 3365.43 0.00
|
||||
void at::native::(anonymous namespace)::Rowwise... 87144 8714.68 0.00 8714.68
|
||||
void cudnn::ops::nchwToNhwcKernel<float, float,... 169824 8525.44 0.00 8525.44
|
||||
aten::clone 214314 8200.26 8568.82 0.00
|
||||
void at::native::elementwise_kernel<128, 2, at:... 220440 8109.62 0.00 8109.62
|
||||
void cutlass::Kernel<cutlass_80_simt_sgemm_128x... 15000 7919.30 0.00 7919.30
|
||||
aten::_to_copy 12219 5963.43 122411.53 0.00
|
||||
aten::to 58101 5952.65 122443.72 0.00
|
||||
aten::conv1d 30000 5878.95 4556.48 0.00
|
||||
Memcpy HtoD (Pageable -> Device) 6696 5856.39 0.00 5856.39
|
||||
aten::reshape 671772 5124.03 9636.01 0.00
|
||||
sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t... 16272 5097.70 0.00 5097.70
|
||||
|
||||
----------------------------------------
|
||||
OPERATOR CATEGORY BREAKDOWN
|
||||
----------------------------------------
|
||||
Category CUDA Time(ms) Percentage
|
||||
---------------------------------------------------------
|
||||
Other 481950.47 41.9%
|
||||
Linear/GEMM 342333.09 29.8%
|
||||
Convolution 159920.77 13.9%
|
||||
Elementwise 54682.93 4.8%
|
||||
Memory 36883.36 3.2%
|
||||
Attention 34736.13 3.0%
|
||||
Normalization 32081.19 2.8%
|
||||
Activation 6449.19 0.6%
|
||||
@@ -0,0 +1,85 @@
|
||||
================================================================================
|
||||
PERFORMANCE PROFILING REPORT
|
||||
================================================================================
|
||||
|
||||
----------------------------------------
|
||||
MACRO-LEVEL TIMING SUMMARY
|
||||
----------------------------------------
|
||||
Section Count Total(ms) Avg(ms) CUDA Avg(ms)
|
||||
--------------------------------------------------------------------------------------
|
||||
action_generation 11 394370.58 35851.87 35851.67
|
||||
data_loading 1 52.00 52.00 52.03
|
||||
get_latent_z/encode 22 899.25 40.88 40.91
|
||||
iteration_total 11 830856.07 75532.37 75531.89
|
||||
load_transitions 1 2.11 2.11 2.16
|
||||
model_loading/checkpoint 1 10410.48 10410.48 10410.60
|
||||
model_loading/config 1 49460.02 49460.02 49460.01
|
||||
model_to_cuda 1 4398.71 4398.71 4398.74
|
||||
prepare_init_input 1 10.26 10.26 10.29
|
||||
prepare_observation 11 5.08 0.46 0.49
|
||||
prepare_wm_observation 11 2.03 0.18 0.21
|
||||
save_results 11 40851.48 3713.77 3713.80
|
||||
synthesis/conditioning_prep 22 2270.48 103.20 103.24
|
||||
synthesis/ddim_sampling 22 775253.03 35238.77 35238.59
|
||||
synthesis/decode_first_stage 22 12416.36 564.38 564.43
|
||||
update_action_queues 11 6.27 0.57 0.60
|
||||
update_state_queues 11 16.57 1.51 1.54
|
||||
world_model_interaction 11 395594.93 35963.18 35962.96
|
||||
--------------------------------------------------------------------------------------
|
||||
TOTAL 2516875.71
|
||||
|
||||
----------------------------------------
|
||||
GPU MEMORY SUMMARY
|
||||
----------------------------------------
|
||||
Peak allocated: 17890.50 MB
|
||||
Average allocated: 16129.98 MB
|
||||
|
||||
----------------------------------------
|
||||
TOP 30 OPERATORS BY CUDA TIME
|
||||
----------------------------------------
|
||||
Operator Count CUDA(ms) CPU(ms) Self CUDA(ms)
|
||||
------------------------------------------------------------------------------------------------
|
||||
ProfilerStep* 6 438046.75 232814.87 232809.14
|
||||
aten::linear 171276 112786.01 10941.68 0.00
|
||||
aten::addmm 81456 79765.93 3676.25 79525.01
|
||||
ampere_sgemm_128x64_tn 26400 52203.84 0.00 52203.84
|
||||
aten::matmul 90468 34345.67 5341.43 0.00
|
||||
aten::_convolution 100242 33699.82 12792.11 0.00
|
||||
aten::mm 89820 33690.79 3067.07 33361.05
|
||||
aten::convolution 100242 33629.44 13178.80 0.00
|
||||
aten::cudnn_convolution 98430 31003.85 9020.54 29316.78
|
||||
ampere_sgemm_32x128_tn 42348 20439.71 0.00 20439.71
|
||||
aten::conv2d 42042 18256.98 5775.15 0.00
|
||||
ampere_sgemm_128x32_tn 40938 16493.37 0.00 16493.37
|
||||
xformers::efficient_attention_forward_cutlass 24000 15256.14 2372.78 15154.49
|
||||
fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti... 24000 15155.37 0.00 15155.37
|
||||
ampere_sgemm_64x64_tn 21000 14660.16 0.00 14660.16
|
||||
aten::copy_ 231819 13133.93 137045.31 12663.88
|
||||
aten::group_norm 87144 12058.55 9417.15 0.00
|
||||
aten::native_group_norm 87144 11497.70 8394.42 11024.58
|
||||
aten::conv3d 26400 8909.30 3210.64 0.00
|
||||
void at::native::(anonymous namespace)::Rowwise... 87144 8732.10 0.00 8732.10
|
||||
void cudnn::ops::nchwToNhwcKernel<float, float,... 169824 8550.65 0.00 8550.65
|
||||
aten::clone 214314 8182.15 7704.97 0.00
|
||||
void at::native::elementwise_kernel<128, 2, at:... 220440 8122.53 0.00 8122.53
|
||||
void cutlass::Kernel<cutlass_80_simt_sgemm_128x... 15000 7959.63 0.00 7959.63
|
||||
aten::conv1d 30000 5921.64 4150.30 0.00
|
||||
aten::reshape 671772 5134.95 7968.26 0.00
|
||||
sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t... 16272 5106.25 0.00 5106.25
|
||||
void cutlass_cudnn_infer::Kernel<cutlass_tensor... 4200 4882.51 0.00 4882.51
|
||||
aten::_to_copy 12219 4575.90 132491.24 0.00
|
||||
aten::to 58101 4568.11 132512.86 0.00
|
||||
|
||||
----------------------------------------
|
||||
OPERATOR CATEGORY BREAKDOWN
|
||||
----------------------------------------
|
||||
Category CUDA Time(ms) Percentage
|
||||
---------------------------------------------------------
|
||||
Other 473442.20 41.5%
|
||||
Linear/GEMM 343517.32 30.1%
|
||||
Convolution 160436.45 14.1%
|
||||
Elementwise 54809.55 4.8%
|
||||
Attention 34810.12 3.1%
|
||||
Memory 34401.76 3.0%
|
||||
Normalization 32147.89 2.8%
|
||||
Activation 6457.30 0.6%
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -22,7 +22,8 @@ dataset="unitree_g1_pack_camera"
|
||||
--guidance_rescale 0.7 \
|
||||
--perframe_ae \
|
||||
--profile \
|
||||
--profile_iterations 3
|
||||
--profile_iterations 3 \
|
||||
--profile_detail full
|
||||
} 2>&1 | tee "${res_dir}/output_profile.log"
|
||||
|
||||
echo ""
|
||||
|
||||
Binary file not shown.
Reference in New Issue
Block a user