From cc06e304046d83507847811e0972fbf55d502f4b Mon Sep 17 00:00:00 2001 From: ianchb Date: Fri, 20 Feb 2026 09:50:40 +0800 Subject: [PATCH] Apply async Sync optimization to Z4c_class using Sync_start/finish pattern Replaces blocking Parallel::Sync + MPI_Allreduce in Z4c_class Step() with non-blocking MPI_Iallreduce overlapped with Sync_start/Sync_finish, matching the pattern already used in bssn_class on cjy-oneapi-opus-hotfix. Covers both ABEtype==2 and CPBC variants (predictor + corrector = 4 call sites). Cherry-picked optimization from afd4006, adapted to SyncCache infrastructure instead of the separate SyncPlan API. Co-Authored-By: Claude Opus 4.6 --- AMSS_NCKU_source/Z4c_class.C | 234 ++++++++++++++++------------------- 1 file changed, 104 insertions(+), 130 deletions(-) diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C index 6f4cd27..1563b9a 100644 --- a/AMSS_NCKU_source/Z4c_class.C +++ b/AMSS_NCKU_source/Z4c_class.C @@ -321,22 +321,7 @@ void Z4c_class::Step(int lev, int YN) } Pp = Pp->next; } - // check error information - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime - << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } + // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell // evolve Shell Patches @@ -354,9 +339,9 @@ void Z4c_class::Step(int lev, int YN) { #if (AGM == 0) f_enforce_ga(cg->shape, - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], - cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], + cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], 
cg->fgfs[Axz0->sgfn], cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); #endif @@ -468,24 +453,16 @@ void Z4c_class::Step(int lev, int YN) sPp = sPp->next; } } - // check error information + // Non-blocking error reduction overlapped with Sync to hide Allreduce latency + MPI_Request err_req_pre; { int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - SH->Dump_Data(StateList, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } + MPI_Iallreduce(MPI_IN_PLACE, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre); } #endif - Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); + Parallel::AsyncSyncState async_pre; + Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); #ifdef WithShell if (lev == 0) @@ -498,12 +475,30 @@ void Z4c_class::Step(int lev, int YN) { prev_clock = curr_clock; curr_clock = clock(); - cout << " Shell stuff synchronization used " - << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) + cout << " Shell stuff synchronization used " + << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! 
" << endl; } } #endif + Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); + +#ifdef WithShell + // Complete non-blocking error reduction and check + MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE); + if (ERROR) + { + Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); + SH->Dump_Data(StateList, 0, PhysTime, dT_lev); + if (myrank == 0) + { + if (ErrorMonitor->outfile) + ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } +#endif // for black hole position if (BH_num > 0 && lev == GH->levels - 1) @@ -693,23 +688,7 @@ void Z4c_class::Step(int lev, int YN) Pp = Pp->next; } - // check error information - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count - << " variables at t = " << PhysTime - << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } + // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell // evolve Shell Patches @@ -850,25 +829,16 @@ void Z4c_class::Step(int lev, int YN) sPp = sPp->next; } } - // check error information + // Non-blocking error reduction overlapped with Sync to hide Allreduce latency + MPI_Request err_req_cor; { int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count - << " variables at t = " << PhysTime << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); } #endif - 
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); + Parallel::AsyncSyncState async_cor; + Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); #ifdef WithShell if (lev == 0) @@ -881,11 +851,30 @@ void Z4c_class::Step(int lev, int YN) { prev_clock = curr_clock; curr_clock = clock(); - cout << " Shell stuff synchronization used " - << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) + cout << " Shell stuff synchronization used " + << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! " << endl; } } +#endif + Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); + +#ifdef WithShell + // Complete non-blocking error reduction and check + MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE); + if (ERROR) + { + Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); + SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); + if (myrank == 0) + { + if (ErrorMonitor->outfile) + ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count + << " variables at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } #endif // for black hole position if (BH_num > 0 && lev == GH->levels - 1) @@ -1252,22 +1241,7 @@ void Z4c_class::Step(int lev, int YN) } } #endif - // check error information - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime - << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } + // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // evolve Shell Patches if (lev == 0) @@ -1542,23 +1516,15 @@ void Z4c_class::Step(int lev, int YN) } #endif } - // check error information + // Non-blocking error reduction 
overlapped with Sync to hide Allreduce latency + MPI_Request err_req_pre; { int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - SH->Dump_Data(StateList, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } + MPI_Iallreduce(MPI_IN_PLACE, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre); } - Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); + Parallel::AsyncSyncState async_pre; + Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); if (lev == 0) { @@ -1570,8 +1536,8 @@ void Z4c_class::Step(int lev, int YN) { prev_clock = curr_clock; curr_clock = clock(); - cout << " Shell stuff synchronization used " - << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) + cout << " Shell stuff synchronization used " + << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! 
" << endl; } @@ -1620,6 +1586,22 @@ void Z4c_class::Step(int lev, int YN) } #endif } + Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); + + // Complete non-blocking error reduction and check + MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE); + if (ERROR) + { + Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); + SH->Dump_Data(StateList, 0, PhysTime, dT_lev); + if (myrank == 0) + { + if (ErrorMonitor->outfile) + ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } // for black hole position if (BH_num > 0 && lev == GH->levels - 1) @@ -1841,23 +1823,7 @@ void Z4c_class::Step(int lev, int YN) Pp = Pp->next; } - // check error information - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count - << " variables at t = " << PhysTime - << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } + // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // evolve Shell Patches if (lev == 0) @@ -2103,24 +2069,15 @@ void Z4c_class::Step(int lev, int YN) sPp = sPp->next; } } - // check error information + // Non-blocking error reduction overlapped with Sync to hide Allreduce latency + MPI_Request err_req_cor; { int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count - << " variables at t = " << PhysTime << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, 
&err_req_cor); } - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); + Parallel::AsyncSyncState async_cor; + Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); if (lev == 0) { @@ -2132,8 +2089,8 @@ void Z4c_class::Step(int lev, int YN) { prev_clock = curr_clock; curr_clock = clock(); - cout << " Shell stuff synchronization used " - << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) + cout << " Shell stuff synchronization used " + << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! " << endl; } } @@ -2170,6 +2127,23 @@ void Z4c_class::Step(int lev, int YN) } // end smooth #endif + Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); + + // Complete non-blocking error reduction and check + MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE); + if (ERROR) + { + Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); + SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); + if (myrank == 0) + { + if (ErrorMonitor->outfile) + ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count + << " variables at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } // for black hole position if (BH_num > 0 && lev == GH->levels - 1)