From ed1d86ade93181fbfd2ab21a0293512e9f1d0869 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Mon, 9 Feb 2026 12:12:16 +0800 Subject: [PATCH] Merge paired MPI_Allreduce error checks to reduce global sync barriers In the two Step() functions that handle both Patch and Shell Patch, defer the Patch error check until after Shell Patch computation completes, then perform a single combined MPI_Allreduce instead of two separate ones. This removes 4 MPI_Allreduce call sites (2 per Step function: one in the Predictor phase and one in the Corrector phase). The optimization is mathematically equivalent: in normal execution (no NaN) behavior is identical; on error, both Patch and Shell data are dumped before MPI_Abort, and the logged message no longer says whether the NaN was found on a Patch or on a Shell Patch. Co-Authored-By: Claude Opus 4.6 --- AMSS_NCKU_source/bssn_class.C | 119 +++++++++------------------------- 1 file changed, 31 insertions(+), 88 deletions(-) diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index fc6c88e..e14092b 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -3158,21 +3158,7 @@ void bssn_class::Step(int lev, int YN) } Pp = Pp->next; } - // check error information - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } + // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell // evolve Shell Patches @@ -3190,9 +3176,9 @@ void bssn_class::Step(int lev, int YN) { #if (AGM == 0) f_enforce_ga(cg->shape, - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], - 
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], + cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); #endif @@ -3316,7 +3302,7 @@ void bssn_class::Step(int lev, int YN) #endif } - // check error information + // check error information (combined Patch + Shell Patch check) { int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); @@ -3324,11 +3310,12 @@ void bssn_class::Step(int lev, int YN) if (ERROR) { + Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); SH->Dump_Data(StateList, 0, PhysTime, dT_lev); if (myrank == 0) { if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl; + ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl; MPI_Abort(MPI_COMM_WORLD, 1); } } @@ -3528,24 +3515,7 @@ void bssn_class::Step(int lev, int YN) Pp = Pp->next; } - // check error information - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - - if (ERROR) - { - Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count - << " variables at t = " << PhysTime - << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } + // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell // evolve Shell Patches @@ -3563,9 +3533,9 @@ void bssn_class::Step(int lev, int YN) { #if (AGM == 0) f_enforce_ga(cg->shape, - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Axx->sgfn], 
cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); #elif (AGM == 1) if (iter_count == 3) @@ -3685,20 +3655,21 @@ void bssn_class::Step(int lev, int YN) sPp = sPp->next; } } - // check error information + // check error information (combined Patch + Shell Patch check) { int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } if (ERROR) { + Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); if (myrank == 0) { if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" - << iter_count << " variables at t = " - << PhysTime << endl; + ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count + << " variables at t = " << PhysTime + << ", lev = " << lev << endl; MPI_Abort(MPI_COMM_WORLD, 1); } } @@ -4034,22 +4005,7 @@ void bssn_class::Step(int lev, int YN) } Pp = Pp->next; } - // check error information - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime - << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } + // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell // evolve Shell Patches @@ -4067,15 +4023,15 @@ void bssn_class::Step(int lev, int YN) { #if (AGM == 0) f_enforce_ga(cg->shape, - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], - cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], + cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Ayy0->sgfn], 
cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); #endif if (f_compute_rhs_bssn_ss(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], - cg->fgfs[fngfs + ShellPatch::gx], - cg->fgfs[fngfs + ShellPatch::gy], + cg->fgfs[fngfs + ShellPatch::gx], + cg->fgfs[fngfs + ShellPatch::gy], cg->fgfs[fngfs + ShellPatch::gz], cg->fgfs[fngfs + ShellPatch::drhodx], cg->fgfs[fngfs + ShellPatch::drhody], @@ -4190,19 +4146,20 @@ void bssn_class::Step(int lev, int YN) } #endif } - // check error information + // check error information (combined Patch + Shell Patch check) { int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } if (ERROR) { + Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); SH->Dump_Data(StateList, 0, PhysTime, dT_lev); if (myrank == 0) { if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " - << PhysTime << endl; + ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime + << ", lev = " << lev << endl; MPI_Abort(MPI_COMM_WORLD, 1); } } @@ -4386,23 +4343,7 @@ void bssn_class::Step(int lev, int YN) Pp = Pp->next; } - // check error information - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count - << " variables at t = " << PhysTime - << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } + // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell // evolve Shell Patches @@ -4420,9 +4361,9 @@ void bssn_class::Step(int lev, int YN) { #if (AGM == 0) f_enforce_ga(cg->shape, - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], 
cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); #elif (AGM == 1) if (iter_count == 3) @@ -4542,19 +4483,21 @@ void bssn_class::Step(int lev, int YN) sPp = sPp->next; } } - // check error information + // check error information (combined Patch + Shell Patch check) { int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } if (ERROR) { + Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); if (myrank == 0) { if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count - << " variables at t = " << PhysTime << endl; + ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count + << " variables at t = " << PhysTime + << ", lev = " << lev << endl; MPI_Abort(MPI_COMM_WORLD, 1); } }