diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 943b293..93704bb 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -3869,6 +3869,7 @@ Parallel::SyncHandle *Parallel::SyncBegin(Patch *Pat, MyList *VarList, int ts->VarList1 = VarList; ts->VarList2 = VarList; ts->Symmetry = Symmetry; + ts->owns_gsl = true; ts->dst = build_ghost_gsl(Pat); ts->src = new MyList *[cpusize]; @@ -3909,6 +3910,7 @@ Parallel::SyncHandle *Parallel::SyncBegin(MyList *PatL, MyList *VarL ts->VarList1 = VarList; ts->VarList2 = VarList; ts->Symmetry = Symmetry; + ts->owns_gsl = true; ts->dst = build_ghost_gsl(Pp->data); ts->src = new MyList *[cpusize]; @@ -3933,6 +3935,7 @@ Parallel::SyncHandle *Parallel::SyncBegin(MyList *PatL, MyList *VarL ts->VarList1 = VarList; ts->VarList2 = VarList; ts->Symmetry = Symmetry; + ts->owns_gsl = true; ts->dst = build_buffer_gsl(PatL); ts->src = new MyList *[cpusize]; @@ -3961,26 +3964,150 @@ void Parallel::SyncEnd(SyncHandle *handle) TransferState *ts = &handle->states[i]; transfer_end(ts); - // Cleanup grid segment lists - if (ts->dst) - ts->dst->destroyList(); - for (int node = 0; node < ts->cpusize; node++) + // Cleanup grid segment lists only if this state owns them + if (ts->owns_gsl) { - if (ts->src[node]) - ts->src[node]->destroyList(); - if (ts->transfer_src[node]) - ts->transfer_src[node]->destroyList(); - if (ts->transfer_dst[node]) - ts->transfer_dst[node]->destroyList(); + if (ts->dst) + ts->dst->destroyList(); + for (int node = 0; node < ts->cpusize; node++) + { + if (ts->src[node]) + ts->src[node]->destroyList(); + if (ts->transfer_src[node]) + ts->transfer_src[node]->destroyList(); + if (ts->transfer_dst[node]) + ts->transfer_dst[node]->destroyList(); + } + delete[] ts->src; + delete[] ts->transfer_src; + delete[] ts->transfer_dst; } - delete[] ts->src; - delete[] ts->transfer_src; - delete[] ts->transfer_dst; } delete[] handle->states; delete handle; } +// +// SyncPreparePlan: Pre-build grid segment lists for a patch list. +// The plan can be reused across multiple SyncBeginWithPlan calls +// as long as the mesh topology does not change (no regridding). +// +Parallel::SyncPlan *Parallel::SyncPreparePlan(MyList *PatL, int Symmetry) +{ + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + // Count patches + int num_patches = 0; + MyList *Pp = PatL; + while (Pp) { num_patches++; Pp = Pp->next; } + + SyncPlan *plan = new SyncPlan; + plan->num_entries = num_patches + 1; // intra-patch + 1 inter-patch + plan->Symmetry = Symmetry; + plan->entries = new SyncPlanEntry[plan->num_entries]; + + // Intra-patch entries: ghost zone exchange within each patch + int idx = 0; + Pp = PatL; + while (Pp) + { + SyncPlanEntry *pe = &plan->entries[idx]; + pe->cpusize = cpusize; + pe->dst = build_ghost_gsl(Pp->data); + pe->src = new MyList *[cpusize]; + pe->transfer_src = new MyList *[cpusize]; + pe->transfer_dst = new MyList *[cpusize]; + for (int node = 0; node < cpusize; node++) + { + pe->src[node] = build_owned_gsl0(Pp->data, node); + build_gstl(pe->src[node], pe->dst, &pe->transfer_src[node], &pe->transfer_dst[node]); + } + idx++; + Pp = Pp->next; + } + + // Inter-patch entry: buffer zone exchange between patches + { + SyncPlanEntry *pe = &plan->entries[idx]; + pe->cpusize = cpusize; + pe->dst = build_buffer_gsl(PatL); + pe->src = new MyList *[cpusize]; + pe->transfer_src = new MyList *[cpusize]; + pe->transfer_dst = new MyList *[cpusize]; + for (int node = 0; node < cpusize; node++) + { + pe->src[node] = build_owned_gsl(PatL, node, 5, Symmetry); + build_gstl(pe->src[node], pe->dst, &pe->transfer_src[node], &pe->transfer_dst[node]); + } + } + + return plan; +} +// +void Parallel::SyncFreePlan(SyncPlan *plan) +{ + if (!plan) + return; + + for (int i = 0; i < plan->num_entries; i++) + { + SyncPlanEntry *pe = &plan->entries[i]; + if (pe->dst) + pe->dst->destroyList(); + for (int node = 0; node < pe->cpusize; node++) + { + if (pe->src[node]) + pe->src[node]->destroyList(); + if (pe->transfer_src[node]) + pe->transfer_src[node]->destroyList(); + if (pe->transfer_dst[node]) + pe->transfer_dst[node]->destroyList(); + } + delete[] pe->src; + delete[] pe->transfer_src; + delete[] pe->transfer_dst; + } + delete[] plan->entries; + delete plan; +} +// +// SyncBeginWithPlan: Use pre-built GSLs from a SyncPlan to initiate async transfer. +// This avoids the O(cpusize * blocks^2) cost of rebuilding GSLs on every call. +// +Parallel::SyncHandle *Parallel::SyncBeginWithPlan(SyncPlan *plan, MyList *VarList) +{ + return SyncBeginWithPlan(plan, VarList, VarList); +} +// +Parallel::SyncHandle *Parallel::SyncBeginWithPlan(SyncPlan *plan, MyList *VarList1, MyList *VarList2) +{ + SyncHandle *handle = new SyncHandle; + handle->num_states = plan->num_entries; + handle->states = new TransferState[handle->num_states]; + + for (int i = 0; i < plan->num_entries; i++) + { + SyncPlanEntry *pe = &plan->entries[i]; + TransferState *ts = &handle->states[i]; + + ts->cpusize = pe->cpusize; + ts->VarList1 = VarList1; + ts->VarList2 = VarList2; + ts->Symmetry = plan->Symmetry; + ts->owns_gsl = false; // GSLs are owned by the plan, not this handle + + // Borrow GSL pointers from the plan (do NOT free them in SyncEnd) + ts->transfer_src = pe->transfer_src; + ts->transfer_dst = pe->transfer_dst; + ts->src = pe->src; + ts->dst = pe->dst; + + transfer_begin(ts); + } + + return handle; +} // collect buffer grid segments or blocks for the periodic boundary condition of given patch // --------------------------------------------------- // |con | |con | diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h index e407aba..26ff2d0 100644 --- a/AMSS_NCKU_source/Parallel.h +++ b/AMSS_NCKU_source/Parallel.h @@ -98,6 +98,7 @@ namespace Parallel MyList *VarList1; MyList *VarList2; int Symmetry; + bool owns_gsl; // true if this state owns and should free the GSLs }; struct SyncHandle { @@ -107,6 +108,26 @@ namespace Parallel SyncHandle *SyncBegin(Patch *Pat, MyList *VarList, int Symmetry); SyncHandle *SyncBegin(MyList *PatL, MyList *VarList, int Symmetry); void SyncEnd(SyncHandle *handle); + + // Cached GSL plan: pre-build grid segment lists once, reuse across multiple Sync calls + struct SyncPlanEntry + { + int cpusize; + MyList **transfer_src; + MyList **transfer_dst; + MyList **src; + MyList *dst; + }; + struct SyncPlan + { + SyncPlanEntry *entries; + int num_entries; + int Symmetry; + }; + SyncPlan *SyncPreparePlan(MyList *PatL, int Symmetry); + void SyncFreePlan(SyncPlan *plan); + SyncHandle *SyncBeginWithPlan(SyncPlan *plan, MyList *VarList); + SyncHandle *SyncBeginWithPlan(SyncPlan *plan, MyList *VarList1, MyList *VarList2); void OutBdLow2Hi(Patch *Patc, Patch *Patf, MyList *VarList1 /* source */, MyList *VarList2 /* target */, int Symmetry); diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C index 6f4cd27..fbfbc51 100644 --- a/AMSS_NCKU_source/Z4c_class.C +++ b/AMSS_NCKU_source/Z4c_class.C @@ -186,6 +186,12 @@ void Z4c_class::Step(int lev, int YN) int ERROR = 0; MyList *sPp; + + // Pre-build grid segment lists once for this level's patches. + // These are reused across predictor + 3 corrector SyncBegin calls, + // avoiding O(cpusize * blocks^2) rebuild each time. + Parallel::SyncPlan *sync_plan = Parallel::SyncPreparePlan(GH->PatL[lev], Symmetry); + // Predictor MyList *Pp = GH->PatL[lev]; while (Pp) @@ -321,13 +327,17 @@ void Z4c_class::Step(int lev, int YN) } Pp = Pp->next; } - // check error information + // Start async ghost zone exchange - overlaps with error check and Shell computation + Parallel::SyncHandle *sync_pre = Parallel::SyncBeginWithPlan(sync_plan, SynchList_pre); + + // check error information (overlaps with MPI transfer) { int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } if (ERROR) { + Parallel::SyncEnd(sync_pre); sync_pre = 0; Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); if (myrank == 0) { @@ -475,6 +485,7 @@ void Z4c_class::Step(int lev, int YN) } if (ERROR) { + Parallel::SyncEnd(sync_pre); sync_pre = 0; SH->Dump_Data(StateList, 0, PhysTime, dT_lev); if (myrank == 0) { @@ -485,7 +496,8 @@ void Z4c_class::Step(int lev, int YN) } #endif - Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); + // Complete async ghost zone exchange + if (sync_pre) Parallel::SyncEnd(sync_pre); #ifdef WithShell if (lev == 0) @@ -693,13 +705,17 @@ void Z4c_class::Step(int lev, int YN) Pp = Pp->next; } - // check error information + // Start async ghost zone exchange - overlaps with error check and Shell computation + Parallel::SyncHandle *sync_cor = Parallel::SyncBeginWithPlan(sync_plan, SynchList_cor); + + // check error information (overlaps with MPI transfer) { int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } if (ERROR) { + Parallel::SyncEnd(sync_cor); sync_cor = 0; Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); if (myrank == 0) { @@ -857,6 +873,7 @@ void Z4c_class::Step(int lev, int YN) } if (ERROR) { + Parallel::SyncEnd(sync_cor); sync_cor = 0; SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); if (myrank == 0) { @@ -868,7 +885,8 @@ void Z4c_class::Step(int lev, int YN) } #endif - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); + // Complete async ghost zone exchange + if (sync_cor) Parallel::SyncEnd(sync_cor); #ifdef WithShell if (lev == 0) @@ -1042,6 +1060,8 @@ void Z4c_class::Step(int lev, int YN) Porg0[ithBH][2] = Porg1[ithBH][2]; } } + + Parallel::SyncFreePlan(sync_plan); } #else // for constraint preserving boundary (CPBC) @@ -1075,6 +1095,10 @@ void Z4c_class::Step(int lev, int YN) int ERROR = 0; MyList *sPp; + + // Pre-build grid segment lists once for this level's patches. + Parallel::SyncPlan *sync_plan = Parallel::SyncPreparePlan(GH->PatL[lev], Symmetry); + // Predictor MyList *Pp = GH->PatL[lev]; while (Pp) @@ -1542,13 +1566,17 @@ void Z4c_class::Step(int lev, int YN) } #endif } - // check error information + // Start async ghost zone exchange - overlaps with error check + Parallel::SyncHandle *sync_pre = Parallel::SyncBeginWithPlan(sync_plan, SynchList_pre); + + // check error information (overlaps with MPI transfer) { int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } if (ERROR) { + Parallel::SyncEnd(sync_pre); sync_pre = 0; SH->Dump_Data(StateList, 0, PhysTime, dT_lev); if (myrank == 0) { @@ -1558,7 +1586,8 @@ void Z4c_class::Step(int lev, int YN) } } - Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); + // Complete async ghost zone exchange + if (sync_pre) Parallel::SyncEnd(sync_pre); if (lev == 0) { @@ -2103,13 +2132,17 @@ void Z4c_class::Step(int lev, int YN) sPp = sPp->next; } } - // check error information + // Start async ghost zone exchange - overlaps with error check + Parallel::SyncHandle *sync_cor = Parallel::SyncBeginWithPlan(sync_plan, SynchList_cor); + + // check error information (overlaps with MPI transfer) { int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } if (ERROR) { + Parallel::SyncEnd(sync_cor); sync_cor = 0; SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); if (myrank == 0) { @@ -2120,7 +2153,8 @@ void Z4c_class::Step(int lev, int YN) } } - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); + // Complete async ghost zone exchange + if (sync_cor) Parallel::SyncEnd(sync_cor); if (lev == 0) { @@ -2346,6 +2380,8 @@ void Z4c_class::Step(int lev, int YN) DG_List->clearList(); } #endif + + Parallel::SyncFreePlan(sync_plan); } #endif #undef MRBD diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 107f970..c374ffa 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -3035,6 +3035,12 @@ void bssn_class::Step(int lev, int YN) int ERROR = 0; MyList *sPp; + + // Pre-build grid segment lists once for this level's patches. + // These are reused across predictor + 3 corrector SyncBegin calls, + // avoiding O(cpusize * blocks^2) rebuild each time. + Parallel::SyncPlan *sync_plan = Parallel::SyncPreparePlan(GH->PatL[lev], Symmetry); + // Predictor MyList *Pp = GH->PatL[lev]; while (Pp) @@ -3160,7 +3166,7 @@ void bssn_class::Step(int lev, int YN) } // Start async ghost zone exchange - overlaps with error check and Shell computation - Parallel::SyncHandle *sync_pre = Parallel::SyncBegin(GH->PatL[lev], SynchList_pre, Symmetry); + Parallel::SyncHandle *sync_pre = Parallel::SyncBeginWithPlan(sync_plan, SynchList_pre); // check error information (overlaps with MPI transfer) { @@ -3536,7 +3542,7 @@ void bssn_class::Step(int lev, int YN) } // Start async ghost zone exchange - overlaps with error check and Shell computation - Parallel::SyncHandle *sync_cor = Parallel::SyncBegin(GH->PatL[lev], SynchList_cor, Symmetry); + Parallel::SyncHandle *sync_cor = Parallel::SyncBeginWithPlan(sync_plan, SynchList_cor); // check error information (overlaps with MPI transfer) { @@ -3908,6 +3914,8 @@ void bssn_class::Step(int lev, int YN) Porg0[ithBH][2] = Porg1[ithBH][2]; } } + + Parallel::SyncFreePlan(sync_plan); } //================================================================================================ @@ -4830,6 +4838,12 @@ void bssn_class::Step(int lev, int YN) int ERROR = 0; MyList *sPp; + + // Pre-build grid segment lists once for this level's patches. + // These are reused across predictor + 3 corrector SyncBegin calls, + // avoiding O(cpusize * blocks^2) rebuild each time. + Parallel::SyncPlan *sync_plan = Parallel::SyncPreparePlan(GH->PatL[lev], Symmetry); + // Predictor MyList *Pp = GH->PatL[lev]; while (Pp) @@ -4957,7 +4971,7 @@ void bssn_class::Step(int lev, int YN) // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation"); // Start async ghost zone exchange - overlaps with error check and BH position - Parallel::SyncHandle *sync_pre = Parallel::SyncBegin(GH->PatL[lev], SynchList_pre, Symmetry); + Parallel::SyncHandle *sync_pre = Parallel::SyncBeginWithPlan(sync_plan, SynchList_pre); // check error information (overlaps with MPI transfer) { @@ -5159,7 +5173,7 @@ void bssn_class::Step(int lev, int YN) // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check"); // Start async ghost zone exchange - overlaps with error check and BH position - Parallel::SyncHandle *sync_cor = Parallel::SyncBegin(GH->PatL[lev], SynchList_cor, Symmetry); + Parallel::SyncHandle *sync_cor = Parallel::SyncBeginWithPlan(sync_plan, SynchList_cor); // check error information (overlaps with MPI transfer) { @@ -5299,6 +5313,8 @@ void bssn_class::Step(int lev, int YN) // if(myrank==GH->start_rank[lev]) cout<mylev<Commlev[lev],GH->start_rank[lev],"complet GH Step"); + + Parallel::SyncFreePlan(sync_plan); } //================================================================================================