From 6410c62e3e56ac2c800e4d9d5652e31e99563ec9 Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Mon, 13 Apr 2026 14:50:55 +0800
Subject: [PATCH] Add fine-grained step timing and trim BH RHS overhead

(cherry picked from commit 968522995b1b14f09c92a39830019e894a19f1ca)
---
 AMSS_NCKU_source/bssn_class.C | 168 ++++++++++++++++++++++++++++++++++
 AMSS_NCKU_source/bssn_class.h |   6 +-
 AMSS_NCKU_source/macrodef.h   |  15 +++
 generate_macrodef.py          |  37 ++++++++
 4 files changed, 223 insertions(+), 3 deletions(-)

diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index 1e9c79d..d134bd4 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -51,6 +51,112 @@ using namespace std;
 #define BSSN_ENABLE_MEM_USAGE_LOG 0
 #endif
 
+#ifndef BSSN_FINE_TIMING
+#define BSSN_FINE_TIMING 0 // fallback when macrodef.h does not set it: all timing hooks compile out
+#endif
+
+#ifndef BSSN_FINE_TIMING_EVERY
+#define BSSN_FINE_TIMING_EVERY 1 // Evolve() reports on steps where ncount % BSSN_FINE_TIMING_EVERY == 0
+#endif
+
+#ifndef BSSN_FINE_TIMING_TOPN
+#define BSSN_FINE_TIMING_TOPN 8 // NOTE(review): defined and documented, but not read by any code added in this patch
+#endif
+// Per-coarse-step wall-clock accounting; buckets are reset every step, so a report covers only the reporting step.
+#if BSSN_FINE_TIMING
+namespace step_timing
+{
+  enum Bucket
+  {
+    TB_ANALYSIS_PSI4 = 0, // NOTE(review): the three TB_ANALYSIS_* buckets are never charged by a
+    TB_ANALYSIS_SURFACE,  // STEP_TIMER_ADD in this patch's hunks, so their columns stay 0
+    TB_ANALYSIS_IO,
+    TB_BH_PREDICTOR,
+    TB_PREDICTOR_RHS,
+    TB_PREDICTOR_SYNC,
+    TB_BH_CORRECTOR,
+    TB_CORRECTOR_RHS,
+    TB_CORRECTOR_SYNC,
+    TB_STATE_SWAP,
+    TB_RESTRICT_PROLONG,
+    TB_CONSTRAINT_OUT,
+    TB_DUMP_3D,
+    TB_DUMP_2D,
+    TB_CHECKPOINT,
+    TB_REGRID,
+    TB_COUNT // number of buckets; keep last
+  };
+  // Seconds accumulated on this rank since the last reset(), indexed by Bucket.
+  static double local_bucket_seconds[TB_COUNT];
+  // Column labels; must stay in the same order as Bucket because both are indexed by the enum value.
+  static const char *const bucket_labels[TB_COUNT] =
+  {
+    "analysis_psi4",
+    "analysis_surface",
+    "analysis_io",
+    "bh_predictor",
+    "predictor_rhs",
+    "predictor_sync",
+    "bh_corrector",
+    "corrector_rhs",
+    "corrector_sync",
+    "state_swap",
+    "restrict_prolong",
+    "constraint_out",
+    "dump_3d",
+    "dump_2d",
+    "checkpoint",
+    "regrid"
+  };
+  // Zero every bucket on this rank.
+  static void reset()
+  {
+    for (int i = 0; i < TB_COUNT; i++)
+      local_bucket_seconds[i] = 0.0;
+  }
+  // Charge `seconds` of wall-clock time to `bucket` (rank-local, no communication).
+  static void add(Bucket bucket, double seconds)
+  {
+    local_bucket_seconds[int(bucket)] += seconds;
+  }
+  // Collective over MPI_COMM_WORLD: reduce all buckets to rank 0 and, there, append one row to TimingMonitor (if non-null).
+  static void report(int myrank, int nprocs, monitor *TimingMonitor,
+                     int step_index, double phys_time, double step_wall_seconds)
+  {
+    double max_bucket_seconds[TB_COUNT];
+    double avg_bucket_seconds[TB_COUNT];
+
+    MPI_Reduce(local_bucket_seconds, max_bucket_seconds, TB_COUNT, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); // largest per-rank value
+    MPI_Reduce(local_bucket_seconds, avg_bucket_seconds, TB_COUNT, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); // sum; divided into a mean below
+
+    if (myrank != 0) // the reductions are rooted at rank 0, so only rank 0 holds results
+      return;
+
+    for (int i = 0; i < TB_COUNT; i++)
+      avg_bucket_seconds[i] /= Mymax(1, nprocs); // Mymax guards against nprocs <= 0
+
+    if (TimingMonitor)
+    {
+      double row[2 + 2 * TB_COUNT]; // layout: step, wall, max[TB_COUNT], avg[TB_COUNT]
+      row[0] = double(step_index);
+      row[1] = step_wall_seconds;
+      for (int i = 0; i < TB_COUNT; i++)
+      {
+        row[2 + i] = max_bucket_seconds[i];
+        row[2 + TB_COUNT + i] = avg_bucket_seconds[i];
+      }
+      TimingMonitor->writefile(phys_time, 2 + 2 * TB_COUNT, row); // phys_time is passed separately from the row
+    }
+  }
+}
+// STEP_TIMER_DECL samples MPI_Wtime() into a const local; STEP_TIMER_ADD charges the time elapsed since then to a bucket.
+#define STEP_TIMER_DECL(var_name) const double var_name = MPI_Wtime()
+#define STEP_TIMER_ADD(bucket_name, var_name) step_timing::add(step_timing::bucket_name, MPI_Wtime() - (var_name))
+#else // timing disabled: both macros expand to nothing, leaving an empty statement at each call site
+#define STEP_TIMER_DECL(var_name)
+#define STEP_TIMER_ADD(bucket_name, var_name)
+#endif
+
 #if USE_CUDA_BSSN
 namespace
 {
@@ -420,6 +526,24 @@ bssn_class::bssn_class(double Couranti, double StartTimei, double TotalTimei,
     a_stream.str("");
     a_stream << setw(15) << "# time Ham Px Py Pz Gx Gy Gz";
     ConVMonitor = new monitor("bssn_constraint.dat", myrank, a_stream.str());
+    // Timing header. NOTE(review): assumes writefile() emits its time argument as the first column, as ConVMonitor's "# time" header implies - confirm.
+#if BSSN_FINE_TIMING
+    a_stream.clear();
+    a_stream.str("");
+    a_stream << setw(15) << "# time" << setw(8) << "step"; // label the leading time value so later labels line up with their data
+    a_stream << setw(14) << "wall";
+    for (int ib = 0; ib < step_timing::TB_COUNT; ib++)
+      a_stream << setw(18) << step_timing::bucket_labels[ib]; // unprefixed labels are the per-bucket maxima over ranks
+    for (int ib = 0; ib < step_timing::TB_COUNT; ib++)
+    {
+      char str_avg[64];
+      snprintf(str_avg, sizeof(str_avg), "avg_%s", step_timing::bucket_labels[ib]); // bounded: a longer label truncates instead of overflowing
+      a_stream << setw(18) << str_avg;
+    }
+    TimingMonitor = new monitor("bssn_step_timing.dat", myrank, a_stream.str());
+#else
+    TimingMonitor = 0; // the destructor deletes TimingMonitor unconditionally, so it must be null when timing is off
+#endif
   }
   // setup sphere integration engine
   Waveshell = new
surface_integral(Symmetry); @@ -1368,6 +1492,7 @@ bssn_class::~bssn_class() delete BHMonitor; delete MAPMonitor; delete ConVMonitor; + delete TimingMonitor; delete Waveshell; delete CheckPoint; @@ -2466,6 +2591,10 @@ void bssn_class::Evolve(int Steps) for (int ncount = 1; ncount < Steps + 1; ncount++) { cuda_level0_constraint_cache_valid = false; +#if BSSN_FINE_TIMING + step_timing::reset(); + STEP_TIMER_DECL(step_wall_start); +#endif // special for large mass ratio consideration // if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6) @@ -2484,7 +2613,9 @@ void bssn_class::Evolve(int Steps) // misc::tillherecheck("before Constraint_Out"); + STEP_TIMER_DECL(timer_constraint_out); Constraint_Out(); // this will affect the Dump_List + STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out); LastDump += dT_mon; Last2dDump += dT_mon; @@ -2493,6 +2624,7 @@ void bssn_class::Evolve(int Steps) // When LastDump >= DumpTime, output corresponding binary data if (LastDump >= DumpTime) { + STEP_TIMER_DECL(timer_dump3d); // misc::tillherecheck("before Dump_Data"); for (int lev = 0; lev < GH->levels; lev++) @@ -2500,6 +2632,7 @@ void bssn_class::Evolve(int Steps) #ifdef WithShell SH->Dump_Data(DumpList, 0, PhysTime, dT_mon); #endif + STEP_TIMER_ADD(TB_DUMP_3D, timer_dump3d); LastDump = 0; @@ -2512,10 +2645,12 @@ void bssn_class::Evolve(int Steps) // When Last2dDump >= d2DumpTime, output corresponding 2D data if (Last2dDump >= d2DumpTime) { + STEP_TIMER_DECL(timer_dump2d); // misc::tillherecheck("before 2dDump_Data"); for (int lev = 0; lev < GH->levels; lev++) Parallel::d2Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon); + STEP_TIMER_ADD(TB_DUMP_2D, timer_dump2d); Last2dDump = 0; @@ -2540,10 +2675,12 @@ void bssn_class::Evolve(int Steps) break; #if (REGLEV == 1) + STEP_TIMER_DECL(timer_regrid); GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_mon, StartTime, 
dT_mon / 2), ErrorMonitor); for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } + STEP_TIMER_ADD(TB_REGRID, timer_regrid); #endif #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2)) @@ -2618,6 +2755,7 @@ void bssn_class::Evolve(int Steps) // When LastCheck >= CheckTime, perform runtime checks and output status data if (LastCheck >= CheckTime) { + STEP_TIMER_DECL(timer_checkpoint); LastCheck = 0; CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass); @@ -2626,7 +2764,13 @@ void bssn_class::Evolve(int Steps) CheckPoint->writecheck_sh(PhysTime, SH); #endif CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas); + STEP_TIMER_ADD(TB_CHECKPOINT, timer_checkpoint); } + +#if BSSN_FINE_TIMING + if (ncount % BSSN_FINE_TIMING_EVERY == 0) + step_timing::report(myrank, nprocs, TimingMonitor, ncount, PhysTime, MPI_Wtime() - step_wall_start); +#endif } /* #ifdef With_AHF @@ -2758,6 +2902,7 @@ void bssn_class::RecursiveStep(int lev) #endif #if (REGLEV == 0) + STEP_TIMER_DECL(timer_regrid_onelevel); if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) @@ -2766,6 +2911,7 @@ void bssn_class::RecursiveStep(int lev) ConstraintRefreshLevels[lev] = 1; for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } } + STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel); #endif } @@ -3365,6 +3511,7 @@ void bssn_class::Step(int lev, int YN) // new code 2013-2-15, zjcao #if (MAPBH == 1) + STEP_TIMER_DECL(timer_bh_predictor); // for black hole position if 
(BH_num > 0 && lev == GH->levels - 1) { @@ -3402,6 +3549,7 @@ void bssn_class::Step(int lev, int YN) { AnalysisStuff(lev, dT_lev); } + STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor); #endif #ifdef With_AHF @@ -3418,6 +3566,7 @@ void bssn_class::Step(int lev, int YN) MyList *sPp; // Predictor + STEP_TIMER_DECL(timer_predictor_rhs); MyList *Pp = GH->PatL[lev]; while (Pp) { @@ -3759,6 +3908,7 @@ void bssn_class::Step(int lev, int YN) } #endif } + STEP_TIMER_ADD(TB_PREDICTOR_RHS, timer_predictor_rhs); // Non-blocking error reduction overlapped with Sync to hide Allreduce latency MPI_Request err_req; @@ -3768,6 +3918,7 @@ void bssn_class::Step(int lev, int YN) } #endif + STEP_TIMER_DECL(timer_predictor_sync); Parallel::AsyncSyncState async_pre; Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); @@ -3789,6 +3940,7 @@ void bssn_class::Step(int lev, int YN) } #endif Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); + STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync); #if USE_CUDA_BSSN const bool need_analysis_state_after_predictor = @@ -3856,6 +4008,7 @@ void bssn_class::Step(int lev, int YN) // corrector for (iter_count = 1; iter_count < 4; iter_count++) { + STEP_TIMER_DECL(timer_corrector_rhs); // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt; if (iter_count == 1 || iter_count == 3) TRK4 += dT_lev / 2; @@ -4198,7 +4351,9 @@ void bssn_class::Step(int lev, int YN) MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); } #endif + STEP_TIMER_ADD(TB_CORRECTOR_RHS, timer_corrector_rhs); + STEP_TIMER_DECL(timer_corrector_sync); Parallel::AsyncSyncState async_cor; Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); @@ -4220,6 +4375,7 @@ void bssn_class::Step(int lev, int YN) } #endif Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); + STEP_TIMER_ADD(TB_CORRECTOR_SYNC, timer_corrector_sync); #ifdef WithShell 
// Complete non-blocking error reduction and check @@ -4240,6 +4396,7 @@ void bssn_class::Step(int lev, int YN) #endif #if (MAPBH == 0) + STEP_TIMER_DECL(timer_bh_corrector); // for black hole position if (BH_num > 0 && lev == GH->levels - 1) { @@ -4272,11 +4429,13 @@ void bssn_class::Step(int lev, int YN) } } } + STEP_TIMER_ADD(TB_BH_CORRECTOR, timer_bh_corrector); #endif // swap time level if (iter_count < 3) { + STEP_TIMER_DECL(timer_state_swap); Pp = GH->PatL[lev]; while (Pp) { @@ -4323,6 +4482,7 @@ void bssn_class::Step(int lev, int YN) } } #endif + STEP_TIMER_ADD(TB_STATE_SWAP, timer_state_swap); } } #if USE_CUDA_BSSN @@ -4358,6 +4518,7 @@ void bssn_class::Step(int lev, int YN) // // OldStateList old ----------- // update + STEP_TIMER_DECL(timer_state_commit); Pp = GH->PatL[lev]; while (Pp) { @@ -4404,6 +4565,7 @@ void bssn_class::Step(int lev, int YN) #endif } #endif + STEP_TIMER_ADD(TB_STATE_SWAP, timer_state_commit); // for black hole position if (BH_num > 0 && lev == GH->levels - 1) { @@ -6255,6 +6417,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, // // SynchList_cor old ----------- { + STEP_TIMER_DECL(timer_restrict_prolong); #if (PSTR == 1 || PSTR == 2) // stringstream a_stream; // a_stream.setf(ios::left); @@ -6401,6 +6564,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); #endif } + STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong); } //================================================================================================ @@ -6420,6 +6584,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, // // SynchList_cor old ----------- { + STEP_TIMER_DECL(timer_restrict_prolong); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"starting RestrictProlong_aux"); if (lev >= GH->levels - 1) @@ -6492,6 +6657,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, 
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); } + STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong); } //================================================================================================ @@ -6502,6 +6668,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, void bssn_class::RestrictProlong(int lev, int YN, bool BB) { + STEP_TIMER_DECL(timer_restrict_prolong); double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); // we assume for fine // SynchList_cor 1 ----------- @@ -6585,6 +6752,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); } + STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong); } //================================================================================================ diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h index 72b7d2f..f799dc0 100644 --- a/AMSS_NCKU_source/bssn_class.h +++ b/AMSS_NCKU_source/bssn_class.h @@ -135,9 +135,9 @@ public: Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong - monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; - monitor *ConVMonitor; - surface_integral *Waveshell; + monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; + monitor *ConVMonitor, *TimingMonitor; + surface_integral *Waveshell; checkpoint *CheckPoint; public: diff --git a/AMSS_NCKU_source/macrodef.h b/AMSS_NCKU_source/macrodef.h index 631f33b..d529b3a 100644 --- a/AMSS_NCKU_source/macrodef.h +++ b/AMSS_NCKU_source/macrodef.h @@ -29,6 +29,12 @@ #define REGLEV 0 +#define BSSN_FINE_TIMING 1 + +#define BSSN_FINE_TIMING_EVERY 1 + +#define BSSN_FINE_TIMING_TOPN 8 + //#define USE_GPU //#define CHECKDETAIL @@ -88,6 +94,15 @@ // 0: for every level; // 1: for all // +// define BSSN_FINE_TIMING +// enable fine-grained per-timestep timing 
monitor +// +// define BSSN_FINE_TIMING_EVERY +// report timing every N coarse timesteps +// +// define BSSN_FINE_TIMING_TOPN +// number of hottest timing buckets shown in stdout +// // define USE_GPU // use gpu or not // diff --git a/generate_macrodef.py b/generate_macrodef.py index efa9d72..8911100 100755 --- a/generate_macrodef.py +++ b/generate_macrodef.py @@ -144,6 +144,34 @@ def generate_macrodef_h(): print( "#define REGLEV 0", file=file1 ) print( file=file1 ) + # Define fine-grained timestep timing macros + # These default to enabled profiling without requiring AMSS_NCKU_Input.py edits. + + fine_timing = getattr(input_data, "Fine_Timing", + getattr(input_data, "Finegrained_Timing", "yes")) + timing_report_every = max(1, int(getattr( + input_data, "Timing_Every_Steps", + getattr(input_data, "Timing_Report_Every", 1)))) + timing_top_hotspots = max(1, int(getattr( + input_data, "Timing_Top_Hotspots", 8))) + + if ( fine_timing == "yes" ): + print( "#define BSSN_FINE_TIMING 1", file=file1 ) + print( file=file1 ) + elif ( fine_timing == "no" ): + print( "#define BSSN_FINE_TIMING 0", file=file1 ) + print( file=file1 ) + else: + print( "Fine_Timing setting error!!!" 
) + print() + print( "# Fine_Timing setting error!!!", file=file1 ) + print( file=file1 ) + + print( f"#define BSSN_FINE_TIMING_EVERY {timing_report_every}", file=file1 ) + print( file=file1 ) + print( f"#define BSSN_FINE_TIMING_TOPN {timing_top_hotspots}", file=file1 ) + print( file=file1 ) + # Define macro USE_GPU # use GPU or not @@ -224,6 +252,15 @@ def generate_macrodef_h(): print( "// 0: for every level;", file=file1 ) print( "// 1: for all", file=file1 ) print( "//", file=file1 ) + print( "// define BSSN_FINE_TIMING", file=file1 ) + print( "// enable fine-grained per-timestep timing monitor", file=file1 ) + print( "//", file=file1 ) + print( "// define BSSN_FINE_TIMING_EVERY", file=file1 ) + print( "// report timing every N coarse timesteps", file=file1 ) + print( "//", file=file1 ) + print( "// define BSSN_FINE_TIMING_TOPN", file=file1 ) + print( "// number of hottest timing buckets shown in stdout", file=file1 ) + print( "//", file=file1 ) print( "// define USE_GPU", file=file1 ) print( "// use gpu or not", file=file1 ) print( "//", file=file1 )