From 42b9cf1ad9859e86f0e2a29e25dec26d5785b71b Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Mon, 9 Feb 2026 21:03:37 +0800
Subject: [PATCH] Optimize MPI Sync with merged transfers, caching, and async overlap

Phase 1: Merge the N+1 transfer() calls issued per Sync(PatchList) into a
single transfer(), reducing N+1 MPI_Waitall barriers to one. The new
Sync_merged() collects all intra-patch and inter-patch grid segment lists
into combined per-rank arrays.

Phase 2: Cache grid segment lists and reuse grow-only communication buffers
across RK4 substeps via a SyncCache struct. Caches are per-level and
per-variable-list (predictor/corrector), and are invalidated on regrid.
This eliminates redundant build_ghost_gsl/build_owned_gsl0/build_gstl
rebuilds and malloc/free cycles between regrids.

Phase 3: Split Sync into async Sync_start/Sync_finish to overlap the
Cartesian ghost-zone exchange (MPI_Isend/MPI_Irecv) with Shell patch
synchronization. Uses MPI tag 2 to avoid conflicts with SH->Synch(), which
calls transfer() with tag 1.

Also updates makefile.inc paths and flags for the local build environment.
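
Intended call pattern (illustrative sketch; the real call sites in
bssn_class::Step() pass GH->PatL[lev] with SynchList_pre/SynchList_cor and
the per-level caches allocated in bssn_class::Initialize()):

    Parallel::SyncCache cache;        // persists across RK4 substeps
    Parallel::AsyncSyncState state;   // per-call request bookkeeping

    // Overlapped form (Phase 3):
    Parallel::Sync_start(PatL, VarList, Symmetry, cache, state); // post Isend/Irecv, tag 2
    // ... independent work, e.g. Shell patch synchronization ...
    Parallel::Sync_finish(cache, state, VarList, Symmetry);      // Waitall + unpack

    // Blocking cached form (Phase 2):
    Parallel::Sync_cached(PatL, VarList, Symmetry, cache);

    // After a regrid the segment lists are stale:
    cache.invalidate(); // drops lists, keeps buffers; rebuilt on next call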

Co-Authored-By: Claude Opus 4.6
---
 AMSS_NCKU_source/Parallel.C   | 478 ++++++++++++++++++++++++++++++++++
 AMSS_NCKU_source/Parallel.h   |  36 +++
 AMSS_NCKU_source/bssn_class.C |  53 +++-
 AMSS_NCKU_source/bssn_class.h |   3 +
 4 files changed, 564 insertions(+), 6 deletions(-)

diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index 713a6a7..d90cdeb 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -3756,6 +3756,484 @@ void Parallel::Sync(MyList *PatL, MyList *VarList, int Symmetry)
     delete[] transfer_src;
     delete[] transfer_dst;
 }
+// Merged Sync: collect all intra-patch and inter-patch grid segment lists,
+// then issue a single transfer() call instead of N+1 separate ones.
+void Parallel::Sync_merged(MyList *PatL, MyList *VarList, int Symmetry)
+{
+    int cpusize;
+    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+
+    MyList **combined_src = new MyList *[cpusize];
+    MyList **combined_dst = new MyList *[cpusize];
+    for (int node = 0; node < cpusize; node++)
+        combined_src[node] = combined_dst[node] = 0;
+
+    // Phase A: Intra-patch ghost exchange segments
+    MyList *Pp = PatL;
+    while (Pp)
+    {
+        Patch *Pat = Pp->data;
+        MyList *dst_ghost = build_ghost_gsl(Pat);
+
+        for (int node = 0; node < cpusize; node++)
+        {
+            MyList *src_owned = build_owned_gsl0(Pat, node);
+            MyList *tsrc = 0, *tdst = 0;
+            build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
+
+            if (tsrc)
+            {
+                if (combined_src[node])
+                    combined_src[node]->catList(tsrc);
+                else
+                    combined_src[node] = tsrc;
+            }
+            if (tdst)
+            {
+                if (combined_dst[node])
+                    combined_dst[node]->catList(tdst);
+                else
+                    combined_dst[node] = tdst;
+            }
+
+            if (src_owned)
+                src_owned->destroyList();
+        }
+
+        if (dst_ghost)
+            dst_ghost->destroyList();
+
+        Pp = Pp->next;
+    }
+
+    // Phase B: Inter-patch buffer exchange segments
+    MyList *dst_buffer = build_buffer_gsl(PatL);
+    for (int node = 0; node < cpusize; node++)
+    {
+        MyList *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
+        MyList *tsrc = 0, *tdst = 0;
+        build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
+
+        if (tsrc)
+        {
+            if (combined_src[node])
+                combined_src[node]->catList(tsrc);
+            else
+                combined_src[node] = tsrc;
+        }
+        if (tdst)
+        {
+            if (combined_dst[node])
+                combined_dst[node]->catList(tdst);
+            else
+                combined_dst[node] = tdst;
+        }
+
+        if (src_owned)
+            src_owned->destroyList();
+    }
+    if (dst_buffer)
+        dst_buffer->destroyList();
+
+    // Phase C: Single transfer
+    transfer(combined_src, combined_dst, VarList, VarList, Symmetry);
+
+    // Phase D: Cleanup
+    for (int node = 0; node < cpusize; node++)
+    {
+        if (combined_src[node])
+            combined_src[node]->destroyList();
+        if (combined_dst[node])
+            combined_dst[node]->destroyList();
+    }
+    delete[] combined_src;
+    delete[] combined_dst;
+}
+// SyncCache constructor
+Parallel::SyncCache::SyncCache()
+    : valid(false), cpusize(0), combined_src(0), combined_dst(0),
+      send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
+      send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0)
+{
+}
+// SyncCache invalidate: free grid segment lists but keep buffers
+void Parallel::SyncCache::invalidate()
+{
+    if (!valid)
+        return;
+    for (int i = 0; i < cpusize; i++)
+    {
+        if (combined_src[i])
+            combined_src[i]->destroyList();
+        if (combined_dst[i])
+            combined_dst[i]->destroyList();
+        combined_src[i] = combined_dst[i] = 0;
+        send_lengths[i] = recv_lengths[i] = 0;
+    }
+    valid = false;
+}
+// SyncCache destroy: free everything
+void Parallel::SyncCache::destroy()
+{
+    invalidate();
+    if (combined_src) delete[] combined_src;
+    if (combined_dst) delete[] combined_dst;
+    if (send_lengths) delete[] send_lengths;
+    if (recv_lengths) delete[] recv_lengths;
+    if (send_buf_caps) delete[] send_buf_caps;
+    if (recv_buf_caps) delete[] recv_buf_caps;
+    for (int i = 0; i < cpusize; i++)
+    {
+        if (send_bufs && send_bufs[i]) delete[] send_bufs[i];
+        if (recv_bufs && recv_bufs[i]) delete[] recv_bufs[i];
+    }
+    if (send_bufs) delete[] send_bufs;
+    if (recv_bufs) delete[] recv_bufs;
+    if (reqs) delete[] reqs;
+    if (stats) delete[] stats;
+    combined_src = combined_dst = 0;
+    send_lengths = recv_lengths = 0;
+    send_buf_caps = recv_buf_caps = 0;
+    send_bufs = recv_bufs = 0;
+    reqs = 0; stats = 0;
+    cpusize = 0; max_reqs = 0;
+}
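+// The calls below follow a grow-only buffer policy.  Equivalent hypothetical
+// helper, shown for clarity only (transfer_cached() inlines this per rank):
+//
+//     static inline void ensure_capacity(double *&buf, int &cap, int need)
+//     {
+//         if (need > cap)
+//         {
+//             delete[] buf;            // delete[] on a null pointer is safe
+//             buf = new double[need];  // grow to fit, never shrink
+//             cap = need;
+//         }
+//     }
+//
+// Buffers only grow, so steady-state RK4 substeps perform no allocation.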
+// transfer_cached: reuse pre-allocated buffers from SyncCache
+void Parallel::transfer_cached(MyList **src, MyList **dst,
+                               MyList *VarList1, MyList *VarList2,
+                               int Symmetry, SyncCache &cache)
+{
+    int myrank;
+    MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    int cpusize = cache.cpusize;
+
+    int req_no = 0;
+    int node;
+
+    for (node = 0; node < cpusize; node++)
+    {
+        if (node == myrank)
+        {
+            int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+            cache.recv_lengths[node] = length;
+            if (length > 0)
+            {
+                if (length > cache.recv_buf_caps[node])
+                {
+                    if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+                    cache.recv_bufs[node] = new double[length];
+                    cache.recv_buf_caps[node] = length;
+                }
+                data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+            }
+        }
+        else
+        {
+            // send
+            int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+            cache.send_lengths[node] = slength;
+            if (slength > 0)
+            {
+                if (slength > cache.send_buf_caps[node])
+                {
+                    if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
+                    cache.send_bufs[node] = new double[slength];
+                    cache.send_buf_caps[node] = slength;
+                }
+                data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+                MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
+            }
+            // recv
+            int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+            cache.recv_lengths[node] = rlength;
+            if (rlength > 0)
+            {
+                if (rlength > cache.recv_buf_caps[node])
+                {
+                    if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+                    cache.recv_bufs[node] = new double[rlength];
+                    cache.recv_buf_caps[node] = rlength;
+                }
+                MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
+            }
+        }
+    }
+
+    MPI_Waitall(req_no, cache.reqs, cache.stats);
+
+    for (node = 0; node < cpusize; node++)
+        if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
+            data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+}
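+// Note: data_packer() is used in two passes above -- a sizing pass with a
+// null buffer that only counts doubles, then a fill pass into the (possibly
+// regrown) cached buffer:
+//
+//     int len = data_packer(0, src, dst, node, PACK, VL1, VL2, Symmetry); // count
+//     if (len > 0)
+//         data_packer(buf, src, dst, node, PACK, VL1, VL2, Symmetry);     // fill
+//
+// The computed lengths are kept in send_lengths/recv_lengths so the unpack
+// loop after MPI_Waitall knows which buffers carry data.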
+// Sync_cached: build grid segment lists on first call, reuse on subsequent calls
+void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache)
+{
+    if (!cache.valid)
+    {
+        int cpusize;
+        MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+        cache.cpusize = cpusize;
+
+        // Allocate cache arrays if needed
+        if (!cache.combined_src)
+        {
+            cache.combined_src = new MyList *[cpusize];
+            cache.combined_dst = new MyList *[cpusize];
+            cache.send_lengths = new int[cpusize];
+            cache.recv_lengths = new int[cpusize];
+            cache.send_bufs = new double *[cpusize];
+            cache.recv_bufs = new double *[cpusize];
+            cache.send_buf_caps = new int[cpusize];
+            cache.recv_buf_caps = new int[cpusize];
+            for (int i = 0; i < cpusize; i++)
+            {
+                cache.send_bufs[i] = cache.recv_bufs[i] = 0;
+                cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
+            }
+            cache.max_reqs = 2 * cpusize;
+            cache.reqs = new MPI_Request[cache.max_reqs];
+            cache.stats = new MPI_Status[cache.max_reqs];
+        }
+
+        for (int node = 0; node < cpusize; node++)
+        {
+            cache.combined_src[node] = cache.combined_dst[node] = 0;
+            cache.send_lengths[node] = cache.recv_lengths[node] = 0;
+        }
+
+        // Build intra-patch segments (same as Sync_merged Phase A)
+        MyList *Pp = PatL;
+        while (Pp)
+        {
+            Patch *Pat = Pp->data;
+            MyList *dst_ghost = build_ghost_gsl(Pat);
+            for (int node = 0; node < cpusize; node++)
+            {
+                MyList *src_owned = build_owned_gsl0(Pat, node);
+                MyList *tsrc = 0, *tdst = 0;
+                build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
+                if (tsrc)
+                {
+                    if (cache.combined_src[node])
+                        cache.combined_src[node]->catList(tsrc);
+                    else
+                        cache.combined_src[node] = tsrc;
+                }
+                if (tdst)
+                {
+                    if (cache.combined_dst[node])
+                        cache.combined_dst[node]->catList(tdst);
+                    else
+                        cache.combined_dst[node] = tdst;
+                }
+                if (src_owned) src_owned->destroyList();
+            }
+            if (dst_ghost) dst_ghost->destroyList();
+            Pp = Pp->next;
+        }
+
+        // Build inter-patch segments (same as Sync_merged Phase B)
+        MyList *dst_buffer = build_buffer_gsl(PatL);
+        for (int node = 0; node < cpusize; node++)
+        {
+            MyList *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
+            MyList *tsrc = 0, *tdst = 0;
+            build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
+            if (tsrc)
+            {
+                if (cache.combined_src[node])
+                    cache.combined_src[node]->catList(tsrc);
+                else
+                    cache.combined_src[node] = tsrc;
+            }
+            if (tdst)
+            {
+                if (cache.combined_dst[node])
+                    cache.combined_dst[node]->catList(tdst);
+                else
+                    cache.combined_dst[node] = tdst;
+            }
+            if (src_owned) src_owned->destroyList();
+        }
+        if (dst_buffer) dst_buffer->destroyList();
+
+        cache.valid = true;
+    }
+
+    // Use cached lists with buffer-reusing transfer
+    transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache);
+}
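+// Request bookkeeping for the async path below: at most one MPI_Isend and
+// one MPI_Irecv are posted per peer rank, so max_reqs = 2 * cpusize bounds
+// reqs[]/stats[].  Illustrative invariant (not checked in the hot path):
+//
+//     assert(state.req_no <= cache.max_reqs);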
+// Sync_start: pack and post MPI_Isend/Irecv, return immediately
+void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetry,
+                          SyncCache &cache, AsyncSyncState &state)
+{
+    // Ensure cache is built
+    if (!cache.valid)
+    {
+        // Build cache (same logic as Sync_cached)
+        int cpusize;
+        MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+        cache.cpusize = cpusize;
+
+        if (!cache.combined_src)
+        {
+            cache.combined_src = new MyList *[cpusize];
+            cache.combined_dst = new MyList *[cpusize];
+            cache.send_lengths = new int[cpusize];
+            cache.recv_lengths = new int[cpusize];
+            cache.send_bufs = new double *[cpusize];
+            cache.recv_bufs = new double *[cpusize];
+            cache.send_buf_caps = new int[cpusize];
+            cache.recv_buf_caps = new int[cpusize];
+            for (int i = 0; i < cpusize; i++)
+            {
+                cache.send_bufs[i] = cache.recv_bufs[i] = 0;
+                cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
+            }
+            cache.max_reqs = 2 * cpusize;
+            cache.reqs = new MPI_Request[cache.max_reqs];
+            cache.stats = new MPI_Status[cache.max_reqs];
+        }
+
+        for (int node = 0; node < cpusize; node++)
+        {
+            cache.combined_src[node] = cache.combined_dst[node] = 0;
+            cache.send_lengths[node] = cache.recv_lengths[node] = 0;
+        }
+
+        MyList *Pp = PatL;
+        while (Pp)
+        {
+            Patch *Pat = Pp->data;
+            MyList *dst_ghost = build_ghost_gsl(Pat);
+            for (int node = 0; node < cpusize; node++)
+            {
+                MyList *src_owned = build_owned_gsl0(Pat, node);
+                MyList *tsrc = 0, *tdst = 0;
+                build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
+                if (tsrc)
+                {
+                    if (cache.combined_src[node])
+                        cache.combined_src[node]->catList(tsrc);
+                    else
+                        cache.combined_src[node] = tsrc;
+                }
+                if (tdst)
+                {
+                    if (cache.combined_dst[node])
+                        cache.combined_dst[node]->catList(tdst);
+                    else
+                        cache.combined_dst[node] = tdst;
+                }
+                if (src_owned) src_owned->destroyList();
+            }
+            if (dst_ghost) dst_ghost->destroyList();
+            Pp = Pp->next;
+        }
+
+        MyList *dst_buffer = build_buffer_gsl(PatL);
+        for (int node = 0; node < cpusize; node++)
+        {
+            MyList *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
+            MyList *tsrc = 0, *tdst = 0;
+            build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
+            if (tsrc)
+            {
+                if (cache.combined_src[node])
+                    cache.combined_src[node]->catList(tsrc);
+                else
+                    cache.combined_src[node] = tsrc;
+            }
+            if (tdst)
+            {
+                if (cache.combined_dst[node])
+                    cache.combined_dst[node]->catList(tdst);
+                else
+                    cache.combined_dst[node] = tdst;
+            }
+            if (src_owned) src_owned->destroyList();
+        }
+        if (dst_buffer) dst_buffer->destroyList();
+        cache.valid = true;
+    }
+
+    // Now pack and post async MPI operations
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    int cpusize = cache.cpusize;
+    state.req_no = 0;
+    state.active = true;
+
+    MyList **src = cache.combined_src;
+    MyList **dst = cache.combined_dst;
+
+    for (int node = 0; node < cpusize; node++)
+    {
+        if (node == myrank)
+        {
+            int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+            cache.recv_lengths[node] = length;
+            if (length > 0)
+            {
+                if (length > cache.recv_buf_caps[node])
+                {
+                    if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+                    cache.recv_bufs[node] = new double[length];
+                    cache.recv_buf_caps[node] = length;
+                }
+                data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+            }
+        }
+        else
+        {
+            int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+            cache.send_lengths[node] = slength;
+            if (slength > 0)
+            {
+                if (slength > cache.send_buf_caps[node])
+                {
+                    if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
+                    cache.send_bufs[node] = new double[slength];
+                    cache.send_buf_caps[node] = slength;
+                }
+                data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+                MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
+            }
+            int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
+            cache.recv_lengths[node] = rlength;
+            if (rlength > 0)
+            {
+                if (rlength > cache.recv_buf_caps[node])
+                {
+                    if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+                    cache.recv_bufs[node] = new double[rlength];
+                    cache.recv_buf_caps[node] = rlength;
+                }
+                MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
+            }
+        }
+    }
+}
+// Sync_finish: wait for async MPI operations and unpack
+void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
+                           MyList *VarList, int Symmetry)
+{
+    if (!state.active)
+        return;
+
+    MPI_Waitall(state.req_no, cache.reqs, cache.stats);
+
+    int cpusize = cache.cpusize;
+    MyList **src = cache.combined_src;
+    MyList **dst = cache.combined_dst;
+
+    for (int node = 0; node < cpusize; node++)
+        if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
+            data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
+
+    state.active = false;
+}
 // collect buffer grid segments or blocks for the periodic boundary condition of given patch
 // ---------------------------------------------------
 // |con |                                       |con |
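Illustrative SyncCache lifecycle, restating the semantics implemented above
(annotation only, not part of the diff):

    Parallel::SyncCache cache;  // constructor zeroes every member
    // First Sync_cached()/Sync_start() call: allocate per-rank arrays,
    // build segment lists, grow pack buffers on demand.
    cache.invalidate();         // on regrid: drop segment lists, keep buffers
    cache.destroy();            // on shutdown: free lists, buffers, reqs, stats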
diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h
index 12fc356..7935727 100644
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -81,6 +81,42 @@ namespace Parallel
               int Symmetry);
     void Sync(Patch *Pat, MyList *VarList, int Symmetry);
     void Sync(MyList *PatL, MyList *VarList, int Symmetry);
+    void Sync_merged(MyList *PatL, MyList *VarList, int Symmetry);
+
+    struct SyncCache {
+        bool valid;
+        int cpusize;
+        MyList **combined_src;
+        MyList **combined_dst;
+        int *send_lengths;
+        int *recv_lengths;
+        double **send_bufs;
+        double **recv_bufs;
+        int *send_buf_caps;
+        int *recv_buf_caps;
+        MPI_Request *reqs;
+        MPI_Status *stats;
+        int max_reqs;
+        SyncCache();
+        void invalidate();
+        void destroy();
+    };
+
+    void Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache);
+    void transfer_cached(MyList **src, MyList **dst,
+                         MyList *VarList1, MyList *VarList2,
+                         int Symmetry, SyncCache &cache);
+
+    struct AsyncSyncState {
+        int req_no;
+        bool active;
+        AsyncSyncState() : req_no(0), active(false) {}
+    };
+
+    void Sync_start(MyList *PatL, MyList *VarList, int Symmetry,
+                    SyncCache &cache, AsyncSyncState &state);
+    void Sync_finish(SyncCache &cache, AsyncSyncState &state,
+                     MyList *VarList, int Symmetry);
     void OutBdLow2Hi(Patch *Patc, Patch *Patf,
                      MyList *VarList1 /* source */, MyList *VarList2 /* target */,
                      int Symmetry);
diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index 553cc72..7a1400e 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -730,6 +730,10 @@ void bssn_class::Initialize()
         PhysTime = StartTime;
         Setup_Black_Hole_position();
     }
+
+    // Initialize sync caches (per-level, for predictor and corrector)
+    sync_cache_pre = new Parallel::SyncCache[GH->levels];
+    sync_cache_cor = new Parallel::SyncCache[GH->levels];
 }
 
 //================================================================================================
@@ -981,6 +985,20 @@ bssn_class::~bssn_class()
     delete Azzz;
 #endif
 
+    // Destroy sync caches before GH
+    if (sync_cache_pre)
+    {
+        for (int i = 0; i < GH->levels; i++)
+            sync_cache_pre[i].destroy();
+        delete[] sync_cache_pre;
+    }
+    if (sync_cache_cor)
+    {
+        for (int i = 0; i < GH->levels; i++)
+            sync_cache_cor[i].destroy();
+        delete[] sync_cache_cor;
+    }
+
     delete GH;
 #ifdef WithShell
     delete SH;
@@ -2181,6 +2199,7 @@ void bssn_class::Evolve(int Steps)
         GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre,
                    fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); }
 #endif
 
 #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2396,6 +2415,7 @@ void bssn_class::RecursiveStep(int lev)
         GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre,
                             fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); }
 #endif
 }
 
@@ -2574,6 +2594,7 @@ void bssn_class::ParallelStep()
         GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre,
                             fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); }
 #endif
 }
 
@@ -2740,6 +2761,7 @@ void bssn_class::ParallelStep()
         GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre,
                             fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); }
 
         // a_stream.clear();
         // a_stream.str("");
@@ -2754,6 +2776,7 @@ void bssn_class::ParallelStep()
         GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre,
                             fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); }
 
         // a_stream.clear();
         // a_stream.str("");
@@ -2772,6 +2795,7 @@ void bssn_class::ParallelStep()
         GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre,
                             fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); }
 
         // a_stream.clear();
         // a_stream.str("");
@@ -2787,6 +2811,7 @@ void bssn_class::ParallelStep()
         GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre,
                             fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); }
 
         // a_stream.clear();
         // a_stream.str("");
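In the Step() hunks that follow, the blocking Sync is split so Shell patch
synchronization overlaps the Cartesian exchange. Net control flow after the
change (illustrative summary of the hunks below):

    Parallel::AsyncSyncState async_pre;
    Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry,
                         sync_cache_pre[lev], async_pre); // post Isend/Irecv (tag 2)
#ifdef WithShell
    // Shell patch synchronization runs while messages are in flight
#endif
    Parallel::Sync_finish(sync_cache_pre[lev], async_pre,
                          SynchList_pre, Symmetry);       // Waitall + unpack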
@@ -3310,7 +3335,8 @@ void bssn_class::Step(int lev, int YN)
     }
 #endif
 
-    Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
+    Parallel::AsyncSyncState async_pre;
+    Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 
 #ifdef WithShell
     if (lev == 0)
@@ -3328,7 +3354,10 @@ void bssn_class::Step(int lev, int YN)
                      << " seconds! " << endl;
         }
     }
+#endif
+    Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
+#ifdef WithShell
     // Complete non-blocking error reduction and check
     MPI_Wait(&err_req, MPI_STATUS_IGNORE);
     if (ERROR)
@@ -3666,7 +3695,8 @@ void bssn_class::Step(int lev, int YN)
     }
 #endif
 
-    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
+    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 
 #ifdef WithShell
     if (lev == 0)
@@ -3684,7 +3714,10 @@ void bssn_class::Step(int lev, int YN)
                      << " seconds! " << endl;
         }
     }
+#endif
+    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
+#ifdef WithShell
     // Complete non-blocking error reduction and check
     MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
     if (ERROR)
@@ -4161,7 +4194,8 @@ void bssn_class::Step(int lev, int YN)
     }
 #endif
 
-    Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
+    Parallel::AsyncSyncState async_pre;
+    Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 
 #ifdef WithShell
     if (lev == 0)
@@ -4179,7 +4213,10 @@ void bssn_class::Step(int lev, int YN)
                      << " seconds! " << endl;
         }
     }
+#endif
+    Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
+#ifdef WithShell
     // Complete non-blocking error reduction and check
     MPI_Wait(&err_req, MPI_STATUS_IGNORE);
     if (ERROR)
@@ -4502,7 +4539,8 @@ void bssn_class::Step(int lev, int YN)
     }
 #endif
 
-    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
+    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 
 #ifdef WithShell
     if (lev == 0)
@@ -4520,7 +4558,10 @@ void bssn_class::Step(int lev, int YN)
                      << " seconds! " << endl;
         }
     }
+#endif
+    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
+#ifdef WithShell
     // Complete non-blocking error reduction and check
     MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
     if (ERROR)
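The final two hunks cover the code path with no Shell work to overlap: the
blocking Sync is replaced by Sync_cached, which keeps the single merged
transfer and buffer reuse but waits in place (illustrative before/after):

    // Before: rebuilds segment lists and reallocates buffers every substep
    Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
    // After: cached segment lists, grow-only buffers, one MPI_Waitall
    Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);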
" << endl; } } +#endif + Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); +#ifdef WithShell // Complete non-blocking error reduction and check MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE); if (ERROR) @@ -4910,7 +4951,7 @@ void bssn_class::Step(int lev, int YN) // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync"); - Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); + Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); // Complete non-blocking error reduction and check MPI_Wait(&err_req, MPI_STATUS_IGNORE); @@ -5111,7 +5152,7 @@ void bssn_class::Step(int lev, int YN) // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync"); - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); + Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync"); diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h index 740d3aa..fe3618b 100644 --- a/AMSS_NCKU_source/bssn_class.h +++ b/AMSS_NCKU_source/bssn_class.h @@ -126,6 +126,9 @@ public: MyList *OldStateList, *DumpList; MyList *ConstraintList; + Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync + Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync + monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ConVMonitor; surface_integral *Waveshell;