From 5c1790277bf6eaf6c6cd2d881385b6aff03a661c Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Wed, 11 Feb 2026 16:09:08 +0800
Subject: [PATCH] Replace nested OutBdLow2Hi loops with batch calls in
 RestrictProlong
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 8 nested while(Ppc){while(Pp){OutBdLow2Hi(single,single,...)}} loops
across the two RestrictProlong overloads, RestrictProlong_aux, and
ProlongRestrict each performed N_c × N_f separate transfer() calls, each
with its own MPI_Waitall barrier (N_c, N_f: number of coarse and fine
patches). Replace them with the existing batch overload
OutBdLow2Hi(MyList*, ...), which merges all patch pairs into a single
transfer() call with one MPI_Waitall.

Also add Restrict_cached, OutBdLow2Hi_cached, and OutBdLow2Himix_cached
to Parallel (currently unused; kept as infrastructure for future use).

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 AMSS_NCKU_source/Parallel.C   | 197 ++++++++++++++++++++++++++++++++++
 AMSS_NCKU_source/Parallel.h   |   9 ++
 AMSS_NCKU_source/bssn_class.C | 112 +++----------------
 3 files changed, 222 insertions(+), 96 deletions(-)

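Schematically, each of the eight converted call sites changes from the
per-pair form (the sketch below only restates the pattern visible in the
diff; the first call site is shown, the other seven differ only in the
variable lists passed):

    Ppc = GH->PatL[lev - 1];         // loop over coarse patches
    while (Ppc)
    {
        Pp = GH->PatL[lev];          // loop over fine patches
        while (Pp)
        {
            // one transfer() + MPI_Waitall per (coarse, fine) patch pair
            Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
            Pp = Pp->next;
        }
        Ppc = Ppc->next;
    }

to the batch form:

    // one transfer() + one MPI_Waitall for all patch pairs on this level
    Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
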
diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index 0cd50a2..a9fb3cd 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -5286,6 +5286,203 @@ void Parallel::OutBdLow2Himix(MyList *PatcL, MyList *PatfL,
     delete[] transfer_src;
     delete[] transfer_dst;
 }
+
+// Restrict_cached: cache grid segment lists, reuse buffers via transfer_cached
+void Parallel::Restrict_cached(MyList *PatcL, MyList *PatfL,
+                               MyList *VarList1, MyList *VarList2,
+                               int Symmetry, SyncCache &cache)
+{
+    if (!cache.valid)
+    {
+        int cpusize;
+        MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+        cache.cpusize = cpusize;
+
+        if (!cache.combined_src)
+        {
+            cache.combined_src = new MyList *[cpusize];
+            cache.combined_dst = new MyList *[cpusize];
+            cache.send_lengths = new int[cpusize];
+            cache.recv_lengths = new int[cpusize];
+            cache.send_bufs = new double *[cpusize];
+            cache.recv_bufs = new double *[cpusize];
+            cache.send_buf_caps = new int[cpusize];
+            cache.recv_buf_caps = new int[cpusize];
+            for (int i = 0; i < cpusize; i++)
+            {
+                cache.send_bufs[i] = cache.recv_bufs[i] = 0;
+                cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
+            }
+            cache.max_reqs = 2 * cpusize;
+            cache.reqs = new MPI_Request[cache.max_reqs];
+            cache.stats = new MPI_Status[cache.max_reqs];
+        }
+
+        MyList *dst = build_complete_gsl(PatcL);
+        for (int node = 0; node < cpusize; node++)
+        {
+            MyList *src_owned = build_owned_gsl(PatfL, node, 2, Symmetry);
+            build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
+            if (src_owned) src_owned->destroyList();
+        }
+        if (dst) dst->destroyList();
+
+        cache.valid = true;
+    }
+
+    transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
+}
+
+// OutBdLow2Hi_cached: cache grid segment lists, reuse buffers via transfer_cached
+void Parallel::OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL,
+                                  MyList *VarList1, MyList *VarList2,
+                                  int Symmetry, SyncCache &cache)
+{
+    if (!cache.valid)
+    {
+        int cpusize;
+        MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+        cache.cpusize = cpusize;
+
+        if (!cache.combined_src)
+        {
+            cache.combined_src = new MyList *[cpusize];
+            cache.combined_dst = new MyList *[cpusize];
+            cache.send_lengths = new int[cpusize];
+            cache.recv_lengths = new int[cpusize];
+            cache.send_bufs = new double *[cpusize];
+            cache.recv_bufs = new double *[cpusize];
+            cache.send_buf_caps = new int[cpusize];
+            cache.recv_buf_caps = new int[cpusize];
+            for (int i = 0; i < cpusize; i++)
+            {
+                cache.send_bufs[i] = cache.recv_bufs[i] = 0;
+                cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
+            }
+            cache.max_reqs = 2 * cpusize;
+            cache.reqs = new MPI_Request[cache.max_reqs];
+            cache.stats = new MPI_Status[cache.max_reqs];
+        }
+
+        MyList *dst = build_buffer_gsl(PatfL);
+        for (int node = 0; node < cpusize; node++)
+        {
+            MyList *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
+            build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
+            if (src_owned) src_owned->destroyList();
+        }
+        if (dst) dst->destroyList();
+
+        cache.valid = true;
+    }
+
+    transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
+}
+
+// OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking
+void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL,
+                                     MyList *VarList1, MyList *VarList2,
+                                     int Symmetry, SyncCache &cache)
+{
+    if (!cache.valid)
+    {
+        int cpusize;
+        MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
+        cache.cpusize = cpusize;
+
+        if (!cache.combined_src)
+        {
+            cache.combined_src = new MyList *[cpusize];
+            cache.combined_dst = new MyList *[cpusize];
+            cache.send_lengths = new int[cpusize];
+            cache.recv_lengths = new int[cpusize];
+            cache.send_bufs = new double *[cpusize];
+            cache.recv_bufs = new double *[cpusize];
+            cache.send_buf_caps = new int[cpusize];
+            cache.recv_buf_caps = new int[cpusize];
+            for (int i = 0; i < cpusize; i++)
+            {
+                cache.send_bufs[i] = cache.recv_bufs[i] = 0;
+                cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
+            }
+            cache.max_reqs = 2 * cpusize;
+            cache.reqs = new MPI_Request[cache.max_reqs];
+            cache.stats = new MPI_Status[cache.max_reqs];
+        }
+
+        MyList *dst = build_buffer_gsl(PatfL);
+        for (int node = 0; node < cpusize; node++)
+        {
+            MyList *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
+            build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
+            if (src_owned) src_owned->destroyList();
+        }
+        if (dst) dst->destroyList();
+
+        cache.valid = true;
+    }
+
+    // Use transfermix instead of transfer for mix-mode interpolation
+    int myrank;
+    MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    int cpusize = cache.cpusize;
+
+    int req_no = 0;
+    for (int node = 0; node < cpusize; node++)
+    {
+        if (node == myrank)
+        {
+            int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+            cache.recv_lengths[node] = length;
+            if (length > 0)
+            {
+                if (length > cache.recv_buf_caps[node])
+                {
+                    if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+                    cache.recv_bufs[node] = new double[length];
+                    cache.recv_buf_caps[node] = length;
+                }
+                data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+            }
+        }
+        else
+        {
+            int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+            cache.send_lengths[node] = slength;
+            if (slength > 0)
+            {
+                if (slength > cache.send_buf_caps[node])
+                {
+                    if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
+                    cache.send_bufs[node] = new double[slength];
+                    cache.send_buf_caps[node] = slength;
+                }
+                data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+                MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
+            }
+            int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+            cache.recv_lengths[node] = rlength;
+            if (rlength > 0)
+            {
+                if (rlength > cache.recv_buf_caps[node])
+                {
+                    if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+                    cache.recv_bufs[node] = new double[rlength];
+                    cache.recv_buf_caps[node] = rlength;
+                }
+                MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
+            }
+        }
+    }
+
+    MPI_Waitall(req_no, cache.reqs, cache.stats);
+
+    for (int node = 0; node < cpusize; node++)
+        if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
+            data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+}
+
 // collect all buffer grid segments or blocks for given patch
 MyList *Parallel::build_buffer_gsl(Patch *Pat)
 {
diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h
index 6ab22af..a6ef351 100644
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -130,6 +130,15 @@ namespace Parallel
     void OutBdLow2Himix(MyList *PatcL, MyList *PatfL,
                         MyList *VarList1 /* source */, MyList *VarList2 /* target */,
                         int Symmetry);
+    void Restrict_cached(MyList *PatcL, MyList *PatfL,
+                         MyList *VarList1, MyList *VarList2,
+                         int Symmetry, SyncCache &cache);
+    void OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL,
+                            MyList *VarList1, MyList *VarList2,
+                            int Symmetry, SyncCache &cache);
+    void OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL,
+                               MyList *VarList1, MyList *VarList2,
+                               int Symmetry, SyncCache &cache);
     void Prolong(Patch *Patc, Patch *Patf,
                  MyList *VarList1 /* source */, MyList *VarList2 /* target */,
                  int Symmetry);
diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index 927bff5..18f1388 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -5819,21 +5819,11 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 
 #if (RPB == 0)
-    Ppc = GH->PatL[lev - 1];
-    while (Ppc)
-    {
-        Pp = GH->PatL[lev];
-        while (Pp)
-        {
 #if (MIXOUTB == 0)
-            Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
+    Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
-            Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
+    Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
-            Pp = Pp->next;
-        }
-        Ppc = Ppc->next;
-    }
 #elif (RPB == 1)
     // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
     Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5880,21 +5870,11 @@
 #endif
 
 #if (RPB == 0)
-    Ppc = GH->PatL[lev - 1];
-    while (Ppc)
-    {
-        Pp = GH->PatL[lev];
-        while (Pp)
-        {
 #if (MIXOUTB == 0)
-            Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
+    Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
-            Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
+    Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
-            Pp = Pp->next;
-        }
-        Ppc = Ppc->next;
-    }
 #elif (RPB == 1)
     // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
     Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -5969,21 +5949,11 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
     Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
 
 #if (RPB == 0)
-    Ppc = GH->PatL[lev - 1];
-    while (Ppc)
-    {
-        Pp = GH->PatL[lev];
-        while (Pp)
-        {
 #if (MIXOUTB == 0)
-            Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
+    Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
-            Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
+    Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
-            Pp = Pp->next;
-        }
-        Ppc = Ppc->next;
-    }
 #elif (RPB == 1)
     // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
     Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -6001,21 +5971,11 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
     Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
 
 #if (RPB == 0)
-    Ppc = GH->PatL[lev - 1];
-    while (Ppc)
-    {
-        Pp = GH->PatL[lev];
-        while (Pp)
-        {
 #if (MIXOUTB == 0)
-            Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
+    Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
-            Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
+    Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
-            Pp = Pp->next;
-        }
-        Ppc = Ppc->next;
-    }
 #elif (RPB == 1)
     // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
     Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -6076,21 +6036,11 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
     Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
 
 #if (RPB == 0)
-    Ppc = GH->PatL[lev - 1];
-    while (Ppc)
-    {
-        Pp = GH->PatL[lev];
-        while (Pp)
-        {
 #if (MIXOUTB == 0)
-            Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
+    Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-            Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
+    Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #endif
-            Pp = Pp->next;
-        }
-        Ppc = Ppc->next;
-    }
 #elif (RPB == 1)
     // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
     Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6110,21 +6060,11 @@
     Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
 
 #if (RPB == 0)
-    Ppc = GH->PatL[lev - 1];
-    while (Ppc)
-    {
-        Pp = GH->PatL[lev];
-        while (Pp)
-        {
 #if (MIXOUTB == 0)
-            Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
+    Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-            Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
+    Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #endif
-            Pp = Pp->next;
-        }
-        Ppc = Ppc->next;
-    }
 #elif (RPB == 1)
     // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
     Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6161,21 +6101,11 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
     }
 
 #if (RPB == 0)
-    Ppc = GH->PatL[lev - 1];
-    while (Ppc)
-    {
-        Pp = GH->PatL[lev];
-        while (Pp)
-        {
 #if (MIXOUTB == 0)
-            Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
+    Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-            Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
+    Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #endif
-            Pp = Pp->next;
-        }
-        Ppc = Ppc->next;
-    }
 #elif (RPB == 1)
     // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
     Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6184,21 +6114,11 @@
     else // no time refinement levels and for all same time levels
     {
 #if (RPB == 0)
-        Ppc = GH->PatL[lev - 1];
-        while (Ppc)
-        {
-            Pp = GH->PatL[lev];
-            while (Pp)
-            {
 #if (MIXOUTB == 0)
-                Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
+        Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-                Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
+        Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #endif
-                Pp = Pp->next;
-            }
-            Ppc = Ppc->next;
-        }
 #elif (RPB == 1)
         // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
         Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
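
Note on the new *_cached entry points: this patch does not call them yet.
A possible future call site, mirroring the existing per-level Sync_cached
usage (the outbd_cache array below is hypothetical storage, one SyncCache
per level, analogous to sync_cache_rp_coarse; a sketch, not applied code):

    // reuses grid segment lists and message buffers across calls;
    // cache.valid must be reset to false whenever the patch lists
    // (grid hierarchy) change, because the lists are built only once
    Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev],
                                 SynchList_pre, SL, Symmetry,
                                 outbd_cache[lev]);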