Optimize BSSN-EScalar CUDA path

This commit is contained in:
2026-05-05 10:47:46 +08:00
parent 06f62dee36
commit 85fe29cc2e
9 changed files with 1821 additions and 276 deletions

View File

@@ -70,6 +70,125 @@ int amss_analysis_map_every()
return every;
}
// Reports whether coarse restrict/prolong timing output is requested.
//
// The AMSS_RP_TIMING environment variable is read once, on the first call;
// the answer is cached for the rest of the process, so later changes to the
// environment have no effect. Any value that parses to a non-zero integer
// turns the flag on; unset, empty, non-numeric or zero values leave it off.
bool amss_rp_timing_enabled()
{
  // Tri-state cache: -1 = not read yet, 0 = off, 1 = on.
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_RP_TIMING");
    // strtol is used instead of atoi: atoi is undefined for out-of-range
    // input and may truncate a large non-zero value to 0.
    enabled = (env && strtol(env, 0, 10) != 0) ? 1 : 0;
  }
  return enabled != 0;
}
// Reports whether the per-phase restrict/prolong timing breakdown is requested.
//
// The AMSS_RP_DETAIL_TIMING environment variable is read once, on the first
// call; the answer is cached for the rest of the process. Any value that
// parses to a non-zero integer turns the flag on; unset, empty, non-numeric
// or zero values leave it off.
bool amss_rp_detail_timing_enabled()
{
  // Tri-state cache: -1 = not read yet, 0 = off, 1 = on.
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_RP_DETAIL_TIMING");
    // strtol is used instead of atoi: atoi is undefined for out-of-range
    // input and may truncate a large non-zero value to 0.
    enabled = (env && strtol(env, 0, 10) != 0) ? 1 : 0;
  }
  return enabled != 0;
}
// Interprets the environment variable `name` as an integer on/off switch.
//
// Returns true only when the variable exists and its value parses to a
// non-zero integer. A null or empty `name`, an unset variable, or a value
// that is empty, non-numeric or zero all yield false. The environment is
// queried on every call; callers that want a cached answer keep their own.
bool amss_env_flag_enabled(const char *name)
{
  // getenv(NULL) is undefined behaviour, so an unnamed flag is simply "off".
  if (!name || !*name)
    return false;
  const char *env = getenv(name);
  if (!env)
    return false;
  // strtol instead of atoi: atoi is undefined for out-of-range input and may
  // truncate a large non-zero value to 0.
  return strtol(env, 0, 10) != 0;
}
// True when AMSS_RP_CACHED_RESTRICT is set to a non-zero integer.
// The environment is consulted on the first call only; the result is reused
// for every later call.
bool amss_cached_rp_restrict_enabled()
{
  static const bool use_cached_restrict =
      amss_env_flag_enabled("AMSS_RP_CACHED_RESTRICT");
  return use_cached_restrict;
}
// True when AMSS_RP_CACHED_OUTBD is set to a non-zero integer.
// The environment is consulted on the first call only; the result is reused
// for every later call.
bool amss_cached_rp_outbd_enabled()
{
  static const bool use_cached_outbd =
      amss_env_flag_enabled("AMSS_RP_CACHED_OUTBD");
  return use_cached_outbd;
}
// True when AMSS_RP_CACHED_FINE_SYNC is set to a non-zero integer.
// The environment is consulted on the first call only; the result is reused
// for every later call.
bool amss_cached_rp_fine_sync_enabled()
{
  static const bool use_cached_fine_sync =
      amss_env_flag_enabled("AMSS_RP_CACHED_FINE_SYNC");
  return use_cached_fine_sync;
}
// True when AMSS_RP_CACHED_COARSE_SYNC is set to a non-zero integer.
// The environment is consulted on the first call only; the result is reused
// for every later call.
bool amss_cached_rp_coarse_sync_enabled()
{
  static const bool use_cached_coarse_sync =
      amss_env_flag_enabled("AMSS_RP_CACHED_COARSE_SYNC");
  return use_cached_coarse_sync;
}
// True when AMSS_RP_SKIP_COARSE_SYNC is set to a non-zero integer.
// The environment is consulted on the first call only; the result is reused
// for every later call.
bool amss_rp_skip_coarse_sync_enabled()
{
  static const bool skip_coarse_sync =
      amss_env_flag_enabled("AMSS_RP_SKIP_COARSE_SYNC");
  return skip_coarse_sync;
}
// True when AMSS_EVOLVE_TIMING is set to a non-zero integer.
// The environment is consulted on the first call only; the result is reused
// for every later call.
bool amss_evolve_timing_enabled()
{
  static const bool timing_on = amss_env_flag_enabled("AMSS_EVOLVE_TIMING");
  return timing_on;
}
// Running totals for the AMSS_EVOLVE_TIMING report. Each field is a sum of the
// `sec` values handed to the matching amss_evolve_timing_add_*() helper since
// the last amss_evolve_timing_reset().
// Field order matters: the struct is brace-initialised as an aggregate.
struct AmssEvolveTimingStats
{
// Total passed to amss_evolve_timing_add_step().
double step;
// Total passed to amss_evolve_timing_add_rp().
double rp;
// Total passed to amss_evolve_timing_add_regrid().
double regrid;
// Total passed to amss_evolve_timing_add_constraint().
double constraint;
};
// Accessor for the single, process-wide timing accumulator.
// The object is created zero-filled on first use and lives until exit.
AmssEvolveTimingStats &amss_evolve_timing_stats()
{
  static AmssEvolveTimingStats totals = AmssEvolveTimingStats();
  return totals;
}
// Clears every accumulated timing bucket (step, rp, regrid, constraint) to 0.
void amss_evolve_timing_reset()
{
  // Assigning a value-initialised instance zeroes all four fields at once.
  amss_evolve_timing_stats() = AmssEvolveTimingStats();
}
void amss_evolve_timing_add_step(double sec)
{
amss_evolve_timing_stats().step += sec;
}
void amss_evolve_timing_add_rp(double sec)
{
amss_evolve_timing_stats().rp += sec;
}
void amss_evolve_timing_add_regrid(double sec)
{
amss_evolve_timing_stats().regrid += sec;
}
void amss_evolve_timing_add_constraint(double sec)
{
amss_evolve_timing_stats().constraint += sec;
}
}
// Compile-time switch for per-timestep memory usage collection/printing.
@@ -288,6 +407,37 @@ bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
}
// Counts the nodes of the `vars` linked list.
//
// Returns the node count (0 for a null list), or -1 as soon as the count
// exceeds BSSN_ESCALAR_CUDA_STATE_COUNT; the walk stops early in that case.
int count_bssn_cuda_state_list(MyList<var> *vars)
{
  int n = 0;
  for (MyList<var> *node = vars; node; node = node->next)
  {
    if (++n > BSSN_ESCALAR_CUDA_STATE_COUNT)
      return -1;
  }
  return n;
}
// Collects, for each variable in `vars`, the block's field pointer
// cg->fgfs[var->sgfn] into host_views[0 .. state_count-1], in list order.
//
// Returns true only when `cg` and `host_views` are non-null, `state_count` is
// BSSN_CUDA_STATE_COUNT or BSSN_ESCALAR_CUDA_STATE_COUNT, and the list holds
// exactly `state_count` entries. On a false return host_views may have been
// partially written.
bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
                                int state_count,
                                double **host_views)
{
  if (!cg || !host_views)
    return false;

  const bool known_layout = state_count == BSSN_CUDA_STATE_COUNT ||
                            state_count == BSSN_ESCALAR_CUDA_STATE_COUNT;
  if (!known_layout)
    return false;

  int filled = 0;
  for (; vars && filled < state_count; vars = vars->next, ++filled)
    host_views[filled] = cg->fgfs[vars->data->sgfn];

  // Success requires the list to end exactly at state_count entries.
  return filled == state_count && vars == 0;
}
bool bssn_cuda_use_resident_sync(int lev)
{
#ifdef WithShell
@@ -467,6 +617,11 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
{
var *vars[3] = {forx, fory, forz};
double *bh_host_key[3] = {
block->fgfs[forx->sgfn],
block->fgfs[fory->sgfn],
block->fgfs[forz->sgfn]
};
double soa3[9];
for (int f = 0; f < 3; f++)
{
@@ -482,6 +637,7 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
DH[0], DH[1], DH[2],
x, y, z,
interp_ordn, interp_sym,
bh_host_key,
soa3, shellf) != 0)
{
const int sx = ordn;
@@ -552,6 +708,7 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
{
const int state_count = count_bssn_cuda_state_list(vars);
MyList<Patch> *Pp = PatL;
while (Pp)
{
@@ -561,13 +718,16 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
Block *cg = BP->data;
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{
double *state_out[BSSN_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views(cg, vars, state_out))
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
{
cout << "CUDA BSSN state list mismatch on resident state download" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (bssn_cuda_download_resident_state(cg, cg->shape, state_out))
const int rc = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT)
? bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out)
: bssn_cuda_download_resident_state(cg, cg->shape, state_out);
if (rc)
{
cout << "CUDA resident state download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
@@ -585,6 +745,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var> *vars, int myrank)
{
const int state_count = count_bssn_cuda_state_list(vars);
MyList<Patch> *Pp = PatL;
while (Pp)
{
@@ -594,13 +755,13 @@ void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var>
Block *cg = BP->data;
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{
double *state_out[BSSN_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views(cg, vars, state_out))
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
{
cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (bssn_cuda_download_resident_state_if_present(cg, cg->shape, state_out))
if (bssn_cuda_download_resident_state_count_if_present(cg, cg->shape, state_out, state_count))
{
cout << "CUDA resident state conditional download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
@@ -2890,6 +3051,10 @@ void bssn_class::Evolve(int Steps)
for (int ncount = 1; ncount < Steps + 1; ncount++)
{
const bool evolve_timing = amss_evolve_timing_enabled();
const double evolve_t0 = evolve_timing ? MPI_Wtime() : 0.0;
if (evolve_timing)
amss_evolve_timing_reset();
cuda_level0_constraint_cache_valid = false;
#if BSSN_FINE_TIMING
step_timing::reset();
@@ -2918,9 +3083,12 @@ void bssn_class::Evolve(int Steps)
// misc::tillherecheck("before Constraint_Out");
const double constraint_t0 = evolve_timing ? MPI_Wtime() : 0.0;
STEP_TIMER_DECL(timer_constraint_out);
Constraint_Out(); // this will affect the Dump_List
STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out);
if (evolve_timing)
amss_evolve_timing_add_constraint(MPI_Wtime() - constraint_t0);
LastDump += dT_mon;
Last2dDump += dT_mon;
@@ -3093,6 +3261,22 @@ void bssn_class::Evolve(int Steps)
if (ncount % BSSN_FINE_TIMING_EVERY == 0)
rhs_kernel_timing_report::report(myrank, nprocs, ncount, MPI_Wtime() - step_wall_start);
#endif
if (evolve_timing)
{
const AmssEvolveTimingStats &stats = amss_evolve_timing_stats();
const double local[4] = {stats.step, stats.rp, stats.regrid, stats.constraint};
double maxv[4] = {};
MPI_Reduce((void *)local, maxv, 4, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (myrank == 0)
{
const double wall = MPI_Wtime() - evolve_t0;
const double known = maxv[0] + maxv[1] + maxv[2] + maxv[3];
fprintf(stderr,
"[AMSS-EVOLVE-TIMING] step=%d wall=%.6f step_fn=%.6f rp=%.6f "
"regrid=%.6f constraint=%.6f other=%.6f\n",
ncount, wall, maxv[0], maxv[1], maxv[2], maxv[3], wall - known);
}
}
}
/*
#ifdef With_AHF
@@ -3162,7 +3346,11 @@ void bssn_class::RecursiveStep(int lev)
{
// if(myrank==0) cout<<"level now = "<<lev<<" NoIteration = "<<i<<endl;
YN = (i == NoIterations - 1) ? 1 : 0; // 1: same time level for coarse level and fine level
const bool evolve_timing = amss_evolve_timing_enabled();
const double step_t0 = evolve_timing ? MPI_Wtime() : 0.0;
Step(lev, YN);
if (evolve_timing)
amss_evolve_timing_add_step(MPI_Wtime() - step_t0);
#if (AGM == 2)
if (GH->levels == 1)
@@ -3195,7 +3383,10 @@ void bssn_class::RecursiveStep(int lev)
//
// till here the PhysTime has updated dT_lev
// if(myrank==0) cout<<"level now = "<<lev<<", "<<fgt(PhysTime-dT_lev,StartTime,dT_lev/2)<<endl;
const double rp_t0 = evolve_timing ? MPI_Wtime() : 0.0;
RestrictProlong(lev, YN, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), StateList, OldStateList, SynchList_cor);
if (evolve_timing)
amss_evolve_timing_add_rp(MPI_Wtime() - rp_t0);
// RestrictProlong(lev,YN,false,StateList,OldStateList,SynchList_cor);
#ifdef WithShell
@@ -3224,6 +3415,8 @@ void bssn_class::RecursiveStep(int lev)
#endif
#if (REGLEV == 0)
const bool evolve_timing = amss_evolve_timing_enabled();
const double regrid_t0 = evolve_timing ? MPI_Wtime() : 0.0;
STEP_TIMER_DECL(timer_regrid_onelevel);
#if USE_CUDA_BSSN
if (bssn_cuda_should_flush_before_regrid(GH, lev, Symmetry, BH_num, Porg0))
@@ -3242,6 +3435,8 @@ void bssn_class::RecursiveStep(int lev)
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
#endif
}
if (evolve_timing)
amss_evolve_timing_add_regrid(MPI_Wtime() - regrid_t0);
STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel);
#endif
}
@@ -6847,6 +7042,15 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
//
// SynchList_cor old -----------
{
const bool rp_runtime_timing = amss_rp_timing_enabled();
const double rp_runtime_start = rp_runtime_timing ? MPI_Wtime() : 0.0;
const bool rp_detail_timing = amss_rp_detail_timing_enabled();
double rp_t_prepare = 0.0;
double rp_t_restrict = 0.0;
double rp_t_coarse_sync = 0.0;
double rp_t_outbd = 0.0;
double rp_t_fine_sync = 0.0;
double rp_t0 = 0.0;
STEP_TIMER_DECL(timer_restrict_prolong);
#if (PSTR == 1 || PSTR == 2)
// stringstream a_stream;
@@ -6858,6 +7062,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
MyList<Patch> *Pp, *Ppc;
if (lev > trfls && YN == 0) // time refinement levels and for intermediat time level
{
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Pp = GH->PatL[lev - 1];
while (Pp)
{
@@ -6873,6 +7078,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
Pp = Pp->next;
}
if (rp_detail_timing) rp_t_prepare += MPI_Wtime() - rp_t0;
#if (PSTR == 1 || PSTR == 2)
// Pp=GH->PatL[lev];
@@ -6889,14 +7095,18 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
#if (ABEtype == 1 || ABEtype == 2)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
#endif
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
#elif (RPB == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
#endif
#if (PSTR == 1 || PSTR == 2)
@@ -6907,10 +7117,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (ABEtype == 1 || ABEtype == 2)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
#endif
#endif
@@ -6922,6 +7136,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
Ppc = GH->PatL[lev - 1];
@@ -6941,9 +7156,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
#endif
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
#elif (RPB == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
#endif
#if (PSTR == 1 || PSTR == 2)
@@ -6964,14 +7182,18 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
#if (ABEtype == 1 || ABEtype == 2)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
#endif
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
#elif (RPB == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
#endif
#if (PSTR == 1 || PSTR == 2)
@@ -6982,10 +7204,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (ABEtype == 1 || ABEtype == 2)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
#endif
#endif
@@ -6997,6 +7223,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
Ppc = GH->PatL[lev - 1];
@@ -7016,9 +7243,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#endif
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
#elif (RPB == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
#endif
#if (PSTR == 1 || PSTR == 2)
@@ -7030,9 +7260,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
}
#if (ABEtype == 1 || ABEtype == 2)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync(GH->PatL[lev], SL, Symmetry);
if (rp_detail_timing) rp_t_fine_sync += MPI_Wtime() - rp_t0;
#else
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
if (rp_detail_timing) rp_t_fine_sync += MPI_Wtime() - rp_t0;
#endif
#if (PSTR == 1 || PSTR == 2)
@@ -7042,6 +7276,27 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif
}
if (rp_runtime_timing)
{
const double local_sec = MPI_Wtime() - rp_runtime_start;
double max_sec = 0.0;
MPI_Reduce((void *)&local_sec, &max_sec, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (myrank == 0)
fprintf(stderr, "[AMSS-RP-TIMING] lev=%d YN=%d BB=%d sec=%.6f\n",
lev, YN, BB ? 1 : 0, max_sec);
}
if (rp_detail_timing)
{
double local_detail[5] = {rp_t_prepare, rp_t_restrict, rp_t_coarse_sync, rp_t_outbd, rp_t_fine_sync};
double max_detail[5] = {};
MPI_Reduce(local_detail, max_detail, 5, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (myrank == 0)
fprintf(stderr,
"[AMSS-RP-DETAIL] lev=%d YN=%d BB=%d prepare=%.6f restrict=%.6f "
"coarse_sync=%.6f outbd=%.6f fine_sync=%.6f\n",
lev, YN, BB ? 1 : 0, max_detail[0], max_detail[1],
max_detail[2], max_detail[3], max_detail[4]);
}
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
}
@@ -7229,7 +7484,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0)
#if (ABEtype == 1 || ABEtype == 2)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
if (amss_cached_rp_restrict_enabled())
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
else
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
#endif
@@ -7239,7 +7497,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#endif
#if (ABEtype == 1 || ABEtype == 2)
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
if (amss_rp_skip_coarse_sync_enabled())
{
}
else if (amss_cached_rp_coarse_sync_enabled())
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
else
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
@@ -7249,16 +7513,23 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0)
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
Ppc = GH->PatL[lev - 1];
while (Ppc)
if (amss_cached_rp_outbd_enabled())
{
Pp = GH->PatL[lev];
while (Pp)
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
}
else
{
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
Pp = Pp->next;
Pp = GH->PatL[lev];
while (Pp)
{
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
Pp = Pp->next;
}
Ppc = Ppc->next;
}
Ppc = Ppc->next;
}
#else
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
@@ -7277,7 +7548,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
#if (RPB == 0)
#if (ABEtype == 1 || ABEtype == 2)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
if (amss_cached_rp_restrict_enabled())
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
else
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
#endif
@@ -7287,7 +7561,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#endif
#if (ABEtype == 1 || ABEtype == 2)
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
if (amss_rp_skip_coarse_sync_enabled())
{
}
else if (amss_cached_rp_coarse_sync_enabled())
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
else
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
@@ -7297,16 +7577,23 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0)
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
Ppc = GH->PatL[lev - 1];
while (Ppc)
if (amss_cached_rp_outbd_enabled())
{
Pp = GH->PatL[lev];
while (Pp)
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
}
else
{
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
Pp = Pp->next;
Pp = GH->PatL[lev];
while (Pp)
{
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
Pp = Pp->next;
}
Ppc = Ppc->next;
}
Ppc = Ppc->next;
}
#else
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
@@ -7321,7 +7608,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
}
#if (ABEtype == 1 || ABEtype == 2)
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
if (amss_cached_rp_fine_sync_enabled())
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
else
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
#else
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
#endif