性能剖析 (Performance profiling)

This commit is contained in:
2026-01-18 00:31:39 +08:00
parent 25c6fc04db
commit c86c2be5ff
26 changed files with 272 additions and 54 deletions

View File

@@ -56,21 +56,50 @@ class TimingRecord:
}
class ProfilerManager:
"""Manages macro and micro-level profiling."""
def __init__(self, enabled: bool = False, output_dir: str = "./profile_output"):
    """Set up empty profiling stores; create the output directory when enabled.

    Args:
        enabled: Whether profiling is active for this manager.
        output_dir: Directory where profiling artifacts are written.
    """
    self.enabled = enabled
    self.output_dir = output_dir
    # Per-section wall-clock samples and paired CUDA start/stop events.
    self.macro_timings: Dict[str, List[float]] = {}
    self.cuda_events: Dict[str, List[tuple]] = {}
    self.memory_snapshots: List[Dict] = []
    # Lazily created torch.profiler handle; None until profiling starts.
    self.pytorch_profiler = None
    self.current_iteration = 0
    self.operator_stats: Dict[str, Dict] = {}
    # Touch the filesystem only when profiling was actually requested.
    if enabled:
        os.makedirs(output_dir, exist_ok=True)
class ProfilerManager:
"""Manages macro and micro-level profiling."""
def __init__(
    self,
    enabled: bool = False,
    output_dir: str = "./profile_output",
    profile_detail: str = "light",
):
    """Initialize profiling state and resolve the detail-level settings.

    Args:
        enabled: Whether profiling is active for this manager.
        output_dir: Directory where profiling artifacts are written.
        profile_detail: Detail level forwarded to _build_profiler_config
            ("light" or "full"; invalid values raise ValueError there).
    """
    self.enabled = enabled
    self.output_dir = output_dir
    self.profile_detail = profile_detail
    # Per-section wall-clock samples and paired CUDA start/stop events.
    self.macro_timings: Dict[str, List[float]] = {}
    self.cuda_events: Dict[str, List[tuple]] = {}
    self.memory_snapshots: List[Dict] = []
    # Lazily created torch.profiler handle; None until profiling starts.
    self.pytorch_profiler = None
    self.current_iteration = 0
    self.operator_stats: Dict[str, Dict] = {}
    # May raise ValueError before any directory is created (kept before makedirs).
    self.profiler_config = self._build_profiler_config(profile_detail)
    # Touch the filesystem only when profiling was actually requested.
    if enabled:
        os.makedirs(output_dir, exist_ok=True)
def _build_profiler_config(self, profile_detail: str) -> Dict[str, Any]:
    """Map a detail level to torch.profiler option flags.

    Every flag is True for "full" and False for "light" — the two branches
    of the original table differ only in that single boolean.

    Args:
        profile_detail: Either "light" or "full".

    Returns:
        Dict of profiler option names to booleans.

    Raises:
        ValueError: If profile_detail is not one of the supported levels.
    """
    if profile_detail not in ("light", "full"):
        raise ValueError(f"Unsupported profile_detail: {profile_detail}")
    verbose = profile_detail == "full"
    return {
        flag: verbose
        for flag in (
            "record_shapes",
            "profile_memory",
            "with_stack",
            "with_flops",
            "with_modules",
            "group_by_input_shape",
        )
    }
@contextmanager
def profile_section(self, name: str, sync_cuda: bool = True):
@@ -133,22 +162,22 @@ class ProfilerManager:
if not self.enabled:
return nullcontext()
self.pytorch_profiler = torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
schedule=torch.profiler.schedule(
wait=wait, warmup=warmup, active=active, repeat=1
),
on_trace_ready=self._trace_handler,
record_shapes=True,
profile_memory=True,
with_stack=True,
with_flops=True,
with_modules=True,
)
return self.pytorch_profiler
self.pytorch_profiler = torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
schedule=torch.profiler.schedule(
wait=wait, warmup=warmup, active=active, repeat=1
),
on_trace_ready=self._trace_handler,
record_shapes=self.profiler_config["record_shapes"],
profile_memory=self.profiler_config["profile_memory"],
with_stack=self.profiler_config["with_stack"],
with_flops=self.profiler_config["with_flops"],
with_modules=self.profiler_config["with_modules"],
)
return self.pytorch_profiler
def _trace_handler(self, prof):
"""Handle profiler trace output."""
@@ -158,8 +187,10 @@ class ProfilerManager:
)
prof.export_chrome_trace(trace_path)
# Extract operator statistics
key_averages = prof.key_averages(group_by_input_shape=True)
# Extract operator statistics
key_averages = prof.key_averages(
group_by_input_shape=self.profiler_config["group_by_input_shape"]
)
for evt in key_averages:
op_name = evt.key
if op_name not in self.operator_stats:
@@ -344,18 +375,22 @@ class ProfilerManager:
# Global profiler instance
_profiler: Optional[ProfilerManager] = None
def get_profiler() -> ProfilerManager:
    """Return the shared profiler, lazily creating a disabled one on first use."""
    global _profiler
    if _profiler is not None:
        return _profiler
    # First access before init_profiler(): fall back to a no-op profiler.
    _profiler = ProfilerManager(enabled=False)
    return _profiler
def init_profiler(enabled: bool, output_dir: str) -> ProfilerManager:
    """Create a fresh ProfilerManager and install it as the global instance.

    Args:
        enabled: Whether profiling is active process-wide.
        output_dir: Directory where profiling artifacts are written.

    Returns:
        The newly installed global profiler.
    """
    global _profiler
    _profiler = ProfilerManager(
        enabled=enabled,
        output_dir=output_dir,
    )
    return _profiler
def get_profiler() -> ProfilerManager:
    """Fetch the process-wide profiler; bootstrap a disabled instance when unset."""
    global _profiler
    # ProfilerManager instances are always truthy (no __bool__/__len__),
    # so the or-fallback fires only when _profiler is still None.
    _profiler = _profiler or ProfilerManager(enabled=False)
    return _profiler
def init_profiler(
    enabled: bool,
    output_dir: str,
    profile_detail: str = "light",
) -> ProfilerManager:
    """Initialize the global profiler.

    Args:
        enabled: Whether profiling is active process-wide.
        output_dir: Directory where profiling artifacts are written.
        profile_detail: "light" or "full". Defaults to "light" (matching the
            --profile_detail CLI default) so existing two-argument callers
            keep working; the previous signature made it required and broke
            backward compatibility.

    Returns:
        The newly installed global ProfilerManager instance.

    Raises:
        ValueError: Propagated from ProfilerManager for an unsupported
            profile_detail value.
    """
    global _profiler
    _profiler = ProfilerManager(
        enabled=enabled,
        output_dir=output_dir,
        profile_detail=profile_detail,
    )
    return _profiler
# ========== Original Functions ==========
@@ -1193,13 +1228,20 @@ def get_parser():
default=None,
help="Directory to save profiling results. Defaults to {savedir}/profile_output."
)
parser.add_argument(
"--profile_iterations",
type=int,
default=3,
help="Number of iterations to run PyTorch profiler's active phase for operator-level analysis."
)
return parser
parser.add_argument(
"--profile_iterations",
type=int,
default=3,
help="Number of iterations to run PyTorch profiler's active phase for operator-level analysis."
)
parser.add_argument(
"--profile_detail",
type=str,
choices=["light", "full"],
default="light",
help="Profiling detail level. Use 'full' for shapes/stacks/memory/flops."
)
return parser
if __name__ == '__main__':
@@ -1214,7 +1256,11 @@ if __name__ == '__main__':
profile_output_dir = args.profile_output_dir
if profile_output_dir is None:
profile_output_dir = os.path.join(args.savedir, "profile_output")
init_profiler(enabled=args.profile, output_dir=profile_output_dir)
init_profiler(
enabled=args.profile,
output_dir=profile_output_dir,
profile_detail=args.profile_detail,
)
rank, gpu_num = 0, 1
run_inference(args, gpu_num, rank)

View File

@@ -0,0 +1,85 @@
================================================================================
PERFORMANCE PROFILING REPORT
================================================================================
----------------------------------------
MACRO-LEVEL TIMING SUMMARY
----------------------------------------
Section Count Total(ms) Avg(ms) CUDA Avg(ms)
--------------------------------------------------------------------------------------
action_generation 11 399707.47 36337.04 36336.85
data_loading 1 52.85 52.85 52.88
get_latent_z/encode 22 901.39 40.97 41.01
iteration_total 11 836793.23 76072.11 76071.63
load_transitions 1 2.24 2.24 2.28
model_loading/checkpoint 1 11833.31 11833.31 11833.43
model_loading/config 1 49774.19 49774.19 49774.16
model_to_cuda 1 8909.30 8909.30 8909.33
prepare_init_input 1 10.52 10.52 10.55
prepare_observation 11 5.41 0.49 0.53
prepare_wm_observation 11 2.12 0.19 0.22
save_results 11 38668.06 3515.28 3515.32
synthesis/conditioning_prep 22 2916.63 132.57 132.61
synthesis/ddim_sampling 22 782695.01 35577.05 35576.86
synthesis/decode_first_stage 22 12444.31 565.65 565.70
update_action_queues 11 6.85 0.62 0.65
update_state_queues 11 17.67 1.61 1.64
world_model_interaction 11 398375.58 36215.96 36215.75
--------------------------------------------------------------------------------------
TOTAL 2543116.13
----------------------------------------
GPU MEMORY SUMMARY
----------------------------------------
Peak allocated: 17890.50 MB
Average allocated: 16129.98 MB
----------------------------------------
TOP 30 OPERATORS BY CUDA TIME
----------------------------------------
Operator Count CUDA(ms) CPU(ms) Self CUDA(ms)
------------------------------------------------------------------------------------------------
ProfilerStep* 6 443804.16 237696.98 237689.25
aten::linear 171276 112286.23 13179.82 0.00
aten::addmm 81456 79537.36 3799.84 79296.37
ampere_sgemm_128x64_tn 26400 52052.10 0.00 52052.10
aten::matmul 90468 34234.05 6281.32 0.00
aten::_convolution 100242 33623.79 13105.89 0.00
aten::mm 89820 33580.74 3202.22 33253.18
aten::convolution 100242 33575.23 13714.47 0.00
aten::cudnn_convolution 98430 30932.19 8640.50 29248.12
ampere_sgemm_32x128_tn 42348 20394.52 0.00 20394.52
aten::conv2d 42042 18115.35 5932.30 0.00
ampere_sgemm_128x32_tn 40938 16429.81 0.00 16429.81
xformers::efficient_attention_forward_cutlass 24000 15222.23 2532.93 15120.44
fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti... 24000 15121.31 0.00 15121.31
ampere_sgemm_64x64_tn 21000 14627.12 0.00 14627.12
aten::copy_ 231819 14504.87 127056.51 14038.39
aten::group_norm 87144 12033.73 10659.57 0.00
aten::native_group_norm 87144 11473.40 9449.36 11002.02
aten::conv3d 26400 8852.13 3365.43 0.00
void at::native::(anonymous namespace)::Rowwise... 87144 8714.68 0.00 8714.68
void cudnn::ops::nchwToNhwcKernel<float, float,... 169824 8525.44 0.00 8525.44
aten::clone 214314 8200.26 8568.82 0.00
void at::native::elementwise_kernel<128, 2, at:... 220440 8109.62 0.00 8109.62
void cutlass::Kernel<cutlass_80_simt_sgemm_128x... 15000 7919.30 0.00 7919.30
aten::_to_copy 12219 5963.43 122411.53 0.00
aten::to 58101 5952.65 122443.72 0.00
aten::conv1d 30000 5878.95 4556.48 0.00
Memcpy HtoD (Pageable -> Device) 6696 5856.39 0.00 5856.39
aten::reshape 671772 5124.03 9636.01 0.00
sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t... 16272 5097.70 0.00 5097.70
----------------------------------------
OPERATOR CATEGORY BREAKDOWN
----------------------------------------
Category CUDA Time(ms) Percentage
---------------------------------------------------------
Other 481950.47 41.9%
Linear/GEMM 342333.09 29.8%
Convolution 159920.77 13.9%
Elementwise 54682.93 4.8%
Memory 36883.36 3.2%
Attention 34736.13 3.0%
Normalization 32081.19 2.8%
Activation 6449.19 0.6%

View File

@@ -0,0 +1,85 @@
================================================================================
PERFORMANCE PROFILING REPORT
================================================================================
----------------------------------------
MACRO-LEVEL TIMING SUMMARY
----------------------------------------
Section Count Total(ms) Avg(ms) CUDA Avg(ms)
--------------------------------------------------------------------------------------
action_generation 11 394370.58 35851.87 35851.67
data_loading 1 52.00 52.00 52.03
get_latent_z/encode 22 899.25 40.88 40.91
iteration_total 11 830856.07 75532.37 75531.89
load_transitions 1 2.11 2.11 2.16
model_loading/checkpoint 1 10410.48 10410.48 10410.60
model_loading/config 1 49460.02 49460.02 49460.01
model_to_cuda 1 4398.71 4398.71 4398.74
prepare_init_input 1 10.26 10.26 10.29
prepare_observation 11 5.08 0.46 0.49
prepare_wm_observation 11 2.03 0.18 0.21
save_results 11 40851.48 3713.77 3713.80
synthesis/conditioning_prep 22 2270.48 103.20 103.24
synthesis/ddim_sampling 22 775253.03 35238.77 35238.59
synthesis/decode_first_stage 22 12416.36 564.38 564.43
update_action_queues 11 6.27 0.57 0.60
update_state_queues 11 16.57 1.51 1.54
world_model_interaction 11 395594.93 35963.18 35962.96
--------------------------------------------------------------------------------------
TOTAL 2516875.71
----------------------------------------
GPU MEMORY SUMMARY
----------------------------------------
Peak allocated: 17890.50 MB
Average allocated: 16129.98 MB
----------------------------------------
TOP 30 OPERATORS BY CUDA TIME
----------------------------------------
Operator Count CUDA(ms) CPU(ms) Self CUDA(ms)
------------------------------------------------------------------------------------------------
ProfilerStep* 6 438046.75 232814.87 232809.14
aten::linear 171276 112786.01 10941.68 0.00
aten::addmm 81456 79765.93 3676.25 79525.01
ampere_sgemm_128x64_tn 26400 52203.84 0.00 52203.84
aten::matmul 90468 34345.67 5341.43 0.00
aten::_convolution 100242 33699.82 12792.11 0.00
aten::mm 89820 33690.79 3067.07 33361.05
aten::convolution 100242 33629.44 13178.80 0.00
aten::cudnn_convolution 98430 31003.85 9020.54 29316.78
ampere_sgemm_32x128_tn 42348 20439.71 0.00 20439.71
aten::conv2d 42042 18256.98 5775.15 0.00
ampere_sgemm_128x32_tn 40938 16493.37 0.00 16493.37
xformers::efficient_attention_forward_cutlass 24000 15256.14 2372.78 15154.49
fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti... 24000 15155.37 0.00 15155.37
ampere_sgemm_64x64_tn 21000 14660.16 0.00 14660.16
aten::copy_ 231819 13133.93 137045.31 12663.88
aten::group_norm 87144 12058.55 9417.15 0.00
aten::native_group_norm 87144 11497.70 8394.42 11024.58
aten::conv3d 26400 8909.30 3210.64 0.00
void at::native::(anonymous namespace)::Rowwise... 87144 8732.10 0.00 8732.10
void cudnn::ops::nchwToNhwcKernel<float, float,... 169824 8550.65 0.00 8550.65
aten::clone 214314 8182.15 7704.97 0.00
void at::native::elementwise_kernel<128, 2, at:... 220440 8122.53 0.00 8122.53
void cutlass::Kernel<cutlass_80_simt_sgemm_128x... 15000 7959.63 0.00 7959.63
aten::conv1d 30000 5921.64 4150.30 0.00
aten::reshape 671772 5134.95 7968.26 0.00
sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t... 16272 5106.25 0.00 5106.25
void cutlass_cudnn_infer::Kernel<cutlass_tensor... 4200 4882.51 0.00 4882.51
aten::_to_copy 12219 4575.90 132491.24 0.00
aten::to 58101 4568.11 132512.86 0.00
----------------------------------------
OPERATOR CATEGORY BREAKDOWN
----------------------------------------
Category CUDA Time(ms) Percentage
---------------------------------------------------------
Other 473442.20 41.5%
Linear/GEMM 343517.32 30.1%
Convolution 160436.45 14.1%
Elementwise 54809.55 4.8%
Attention 34810.12 3.1%
Memory 34401.76 3.0%
Normalization 32147.89 2.8%
Activation 6457.30 0.6%

View File

@@ -22,7 +22,8 @@ dataset="unitree_g1_pack_camera"
--guidance_rescale 0.7 \
--perframe_ae \
--profile \
--profile_iterations 3
--profile_iterations 3 \
--profile_detail full
} 2>&1 | tee "${res_dir}/output_profile.log"
echo ""

1
usefal.sh Normal file
View File

@@ -0,0 +1 @@
# Score a predicted video against the ground-truth clip with the PSNR helper
# script and write the result as JSON (paths are hard-coded for case1).
python3 psnr_score_for_challenge.py --gt_video unitree_g1_pack_camera/case1/unitree_g1_pack_camera_case1.mp4 --pred_video unitree_g1_pack_camera/case1/output/inference/0_full_fs6.mp4 --output_file unitree_g1_pack_camera/case1/psnr_result.json