理论性能分析
This commit is contained in:
@@ -7,79 +7,97 @@ MACRO-LEVEL TIMING SUMMARY
|
||||
----------------------------------------
|
||||
Section Count Total(ms) Avg(ms) CUDA Avg(ms)
|
||||
--------------------------------------------------------------------------------------
|
||||
action_generation 11 399707.47 36337.04 36336.85
|
||||
data_loading 1 52.85 52.85 52.88
|
||||
get_latent_z/encode 22 901.39 40.97 41.01
|
||||
iteration_total 11 836793.23 76072.11 76071.63
|
||||
load_transitions 1 2.24 2.24 2.28
|
||||
model_loading/checkpoint 1 11833.31 11833.31 11833.43
|
||||
model_loading/config 1 49774.19 49774.19 49774.16
|
||||
model_to_cuda 1 8909.30 8909.30 8909.33
|
||||
prepare_init_input 1 10.52 10.52 10.55
|
||||
prepare_observation 11 5.41 0.49 0.53
|
||||
prepare_wm_observation 11 2.12 0.19 0.22
|
||||
save_results 11 38668.06 3515.28 3515.32
|
||||
synthesis/conditioning_prep 22 2916.63 132.57 132.61
|
||||
synthesis/ddim_sampling 22 782695.01 35577.05 35576.86
|
||||
synthesis/decode_first_stage 22 12444.31 565.65 565.70
|
||||
update_action_queues 11 6.85 0.62 0.65
|
||||
update_state_queues 11 17.67 1.61 1.64
|
||||
world_model_interaction 11 398375.58 36215.96 36215.75
|
||||
action_generation 11 173133.54 15739.41 15739.36
|
||||
data_loading 1 54.31 54.31 54.34
|
||||
get_latent_z/encode 22 785.25 35.69 35.72
|
||||
iteration_total 11 386482.08 35134.73 35134.55
|
||||
load_transitions 1 2.07 2.07 2.10
|
||||
model_loading/prepared 1 4749.22 4749.22 4749.83
|
||||
prepare_init_input 1 29.19 29.19 29.22
|
||||
prepare_observation 11 5.49 0.50 0.53
|
||||
prepare_wm_observation 11 1.93 0.18 0.20
|
||||
save_results 11 38791.18 3526.47 3526.51
|
||||
synthesis/conditioning_prep 22 2528.23 114.92 114.95
|
||||
synthesis/ddim_sampling 22 336003.29 15272.88 15272.83
|
||||
synthesis/decode_first_stage 22 9095.14 413.42 413.46
|
||||
update_action_queues 11 7.28 0.66 0.69
|
||||
update_state_queues 11 17.38 1.58 1.61
|
||||
world_model_interaction 11 174516.52 15865.14 15865.07
|
||||
--------------------------------------------------------------------------------------
|
||||
TOTAL 2543116.13
|
||||
TOTAL 1126202.08
|
||||
|
||||
----------------------------------------
|
||||
GPU MEMORY SUMMARY
|
||||
----------------------------------------
|
||||
Peak allocated: 17890.50 MB
|
||||
Average allocated: 16129.98 MB
|
||||
Peak allocated: 18188.29 MB
|
||||
Average allocated: 9117.49 MB
|
||||
|
||||
----------------------------------------
|
||||
TOP 30 OPERATORS BY CUDA TIME
|
||||
----------------------------------------
|
||||
Operator Count CUDA(ms) CPU(ms) Self CUDA(ms)
|
||||
------------------------------------------------------------------------------------------------
|
||||
ProfilerStep* 6 443804.16 237696.98 237689.25
|
||||
aten::linear 171276 112286.23 13179.82 0.00
|
||||
aten::addmm 81456 79537.36 3799.84 79296.37
|
||||
ampere_sgemm_128x64_tn 26400 52052.10 0.00 52052.10
|
||||
aten::matmul 90468 34234.05 6281.32 0.00
|
||||
aten::_convolution 100242 33623.79 13105.89 0.00
|
||||
aten::mm 89820 33580.74 3202.22 33253.18
|
||||
aten::convolution 100242 33575.23 13714.47 0.00
|
||||
aten::cudnn_convolution 98430 30932.19 8640.50 29248.12
|
||||
ampere_sgemm_32x128_tn 42348 20394.52 0.00 20394.52
|
||||
aten::conv2d 42042 18115.35 5932.30 0.00
|
||||
ampere_sgemm_128x32_tn 40938 16429.81 0.00 16429.81
|
||||
xformers::efficient_attention_forward_cutlass 24000 15222.23 2532.93 15120.44
|
||||
fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti... 24000 15121.31 0.00 15121.31
|
||||
ampere_sgemm_64x64_tn 21000 14627.12 0.00 14627.12
|
||||
aten::copy_ 231819 14504.87 127056.51 14038.39
|
||||
aten::group_norm 87144 12033.73 10659.57 0.00
|
||||
aten::native_group_norm 87144 11473.40 9449.36 11002.02
|
||||
aten::conv3d 26400 8852.13 3365.43 0.00
|
||||
void at::native::(anonymous namespace)::Rowwise... 87144 8714.68 0.00 8714.68
|
||||
void cudnn::ops::nchwToNhwcKernel<float, float,... 169824 8525.44 0.00 8525.44
|
||||
aten::clone 214314 8200.26 8568.82 0.00
|
||||
void at::native::elementwise_kernel<128, 2, at:... 220440 8109.62 0.00 8109.62
|
||||
void cutlass::Kernel<cutlass_80_simt_sgemm_128x... 15000 7919.30 0.00 7919.30
|
||||
aten::_to_copy 12219 5963.43 122411.53 0.00
|
||||
aten::to 58101 5952.65 122443.72 0.00
|
||||
aten::conv1d 30000 5878.95 4556.48 0.00
|
||||
Memcpy HtoD (Pageable -> Device) 6696 5856.39 0.00 5856.39
|
||||
aten::reshape 671772 5124.03 9636.01 0.00
|
||||
sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t... 16272 5097.70 0.00 5097.70
|
||||
ProfilerStep* 18 690146.23 133688.74 616385.44
|
||||
aten::group_norm 168624 24697.84 29217.27 0.00
|
||||
aten::_convolution 96450 21420.26 12845.86 0.00
|
||||
aten::convolution 96450 21408.68 13480.97 0.00
|
||||
aten::linear 297398 20780.15 26257.38 0.00
|
||||
aten::cudnn_convolution 94638 18660.24 8239.04 18329.28
|
||||
aten::copy_ 772677 18135.46 17387.09 17864.87
|
||||
aten::conv3d 52800 12922.42 8572.58 0.00
|
||||
aten::conv2d 52469 12747.13 7725.70 0.00
|
||||
aten::native_group_norm 84312 10285.37 8974.31 10197.66
|
||||
aten::_to_copy 590277 10270.09 22570.90 0.00
|
||||
aten::to 602979 9655.26 23666.06 0.00
|
||||
aten::conv1d 56245 8174.37 10015.24 0.00
|
||||
void at::native::(anonymous namespace)::Rowwise... 84312 7979.71 0.00 7979.71
|
||||
aten::clone 177132 7502.90 7007.48 0.00
|
||||
void cudnn::ops::nchwToNhwcKernel<__nv_bfloat16... 164700 7384.52 0.00 7384.52
|
||||
aten::addmm 81456 6958.44 3903.01 6908.44
|
||||
aten::layer_norm 65700 5698.92 7816.08 0.00
|
||||
void at::native::elementwise_kernel<128, 4, at:... 149688 5372.46 0.00 5372.46
|
||||
void at::native::unrolled_elementwise_kernel<at... 180120 5165.28 0.00 5165.28
|
||||
ampere_bf16_s16816gemm_bf16_128x128_ldg8_relu_f... 24900 4449.05 0.00 4449.05
|
||||
void at::native::unrolled_elementwise_kernel<at... 368664 4405.30 0.00 4405.30
|
||||
aten::reshape 686778 3771.84 8309.51 0.00
|
||||
aten::contiguous 46008 3400.88 1881.73 0.00
|
||||
sm80_xmma_fprop_implicit_gemm_bf16bf16_bf16f32_... 15516 3398.03 0.00 3398.03
|
||||
aten::matmul 90489 3366.62 4946.69 0.00
|
||||
aten::mm 89820 3284.53 3308.76 3228.56
|
||||
void at::native::elementwise_kernel<128, 2, at:... 46518 2441.55 0.00 2441.55
|
||||
aten::add 113118 2426.66 2776.23 2385.52
|
||||
void at::native::elementwise_kernel<128, 4, at:... 104550 2426.41 0.00 2426.41
|
||||
|
||||
----------------------------------------
|
||||
OPERATOR CATEGORY BREAKDOWN
|
||||
----------------------------------------
|
||||
Category CUDA Time(ms) Percentage
|
||||
---------------------------------------------------------
|
||||
Other 481950.47 41.9%
|
||||
Linear/GEMM 342333.09 29.8%
|
||||
Convolution 159920.77 13.9%
|
||||
Elementwise 54682.93 4.8%
|
||||
Memory 36883.36 3.2%
|
||||
Attention 34736.13 3.0%
|
||||
Normalization 32081.19 2.8%
|
||||
Activation 6449.19 0.6%
|
||||
Other 723472.91 71.9%
|
||||
Convolution 114469.81 11.4%
|
||||
Memory 53845.46 5.4%
|
||||
Normalization 46852.57 4.7%
|
||||
Linear/GEMM 35354.58 3.5%
|
||||
Elementwise 17078.44 1.7%
|
||||
Activation 12296.29 1.2%
|
||||
Attention 2956.61 0.3%
|
||||
|
||||
------------------------------------------------------------------------------------------
|
||||
aten::addmm (Linear/GEMM) UTILIZATION ANALYSIS ON A100
|
||||
------------------------------------------------------------------------------------------
|
||||
Effective compute precision: BF16 Tensor Core (312 TFLOPS)
|
||||
torch.backends.cuda.matmul.allow_tf32 = False
|
||||
|
||||
Metric Value
|
||||
-------------------------------------------------------------------
|
||||
Total aten::addmm calls 81,456
|
||||
Total Self CUDA time 6908.44 ms
|
||||
Total FLOPs (profiler)                        1.33 PFLOPs
|
||||
Achieved throughput                           191.88 TFLOPS
|
||||
A100 peak throughput                          312.00 TFLOPS
|
||||
MFU (Model FLOPs Utilization) 61.50%
|
||||
|
||||
INTERPRETATION:
|
||||
-------------------------------------------------------------------
|
||||
Good utilization (>60%). GEMM kernels are compute-bound
|
||||
and running efficiently on Tensor Cores.
|
||||
|
||||
Reference in New Issue
Block a user