Optimize BSSN-EScalar CUDA path
This commit is contained in:
@@ -70,6 +70,125 @@ int amss_analysis_map_every()
|
||||
return every;
|
||||
}
|
||||
|
||||
// Reports whether the per-call RestrictProlong wall-time report is requested.
// The AMSS_RP_TIMING environment variable is inspected on the first call only;
// the result is cached, so later changes to the environment are ignored.
// The flag is on when the variable is set and parses to a nonzero integer.
bool amss_rp_timing_enabled()
{
  static int enabled = -1; // -1: environment not inspected yet
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_RP_TIMING");
    // strtol rather than atoi: atoi has an undefined result when the value does
    // not fit in int, whereas strtol saturates and stays nonzero.
    enabled = (env && strtol(env, NULL, 10) != 0) ? 1 : 0;
  }
  return enabled != 0;
}
|
||||
|
||||
// Reports whether the per-phase RestrictProlong timing breakdown is requested.
// The AMSS_RP_DETAIL_TIMING environment variable is inspected on the first
// call only; the result is cached for the rest of the run.
// The flag is on when the variable is set and parses to a nonzero integer.
bool amss_rp_detail_timing_enabled()
{
  static int enabled = -1; // -1: environment not inspected yet
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_RP_DETAIL_TIMING");
    // strtol rather than atoi: atoi has an undefined result when the value does
    // not fit in int, whereas strtol saturates and stays nonzero.
    enabled = (env && strtol(env, NULL, 10) != 0) ? 1 : 0;
  }
  return enabled != 0;
}
|
||||
|
||||
// Returns true when the environment variable `name` is set and its value
// parses (base 10) to a nonzero integer. Unset variables, non-numeric text and
// "0" all count as disabled. The environment is re-read on every call.
bool amss_env_flag_enabled(const char *name)
{
  if (!name) // getenv must not be handed a null name
    return false;
  const char *env = getenv(name);
  if (!env)
    return false;
  // strtol rather than atoi: atoi has an undefined result when the value does
  // not fit in int (a large value may truncate to 0); strtol saturates instead.
  return strtol(env, NULL, 10) != 0;
}
|
||||
|
||||
bool amss_cached_rp_restrict_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
enabled = amss_env_flag_enabled("AMSS_RP_CACHED_RESTRICT") ? 1 : 0;
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
bool amss_cached_rp_outbd_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
enabled = amss_env_flag_enabled("AMSS_RP_CACHED_OUTBD") ? 1 : 0;
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
bool amss_cached_rp_fine_sync_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
enabled = amss_env_flag_enabled("AMSS_RP_CACHED_FINE_SYNC") ? 1 : 0;
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
bool amss_cached_rp_coarse_sync_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
enabled = amss_env_flag_enabled("AMSS_RP_CACHED_COARSE_SYNC") ? 1 : 0;
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
bool amss_rp_skip_coarse_sync_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
enabled = amss_env_flag_enabled("AMSS_RP_SKIP_COARSE_SYNC") ? 1 : 0;
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
bool amss_evolve_timing_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
enabled = amss_env_flag_enabled("AMSS_EVOLVE_TIMING") ? 1 : 0;
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
// Accumulated wall-clock durations for the evolve timing report.
// Each field is a running sum fed by the matching amss_evolve_timing_add_*
// function. Field order is part of the layout (aggregate initialisation).
struct AmssEvolveTimingStats
{
  double step;       // total passed to amss_evolve_timing_add_step
  double rp;         // total passed to amss_evolve_timing_add_rp
  double regrid;     // total passed to amss_evolve_timing_add_regrid
  double constraint; // total passed to amss_evolve_timing_add_constraint
};
|
||||
|
||||
// Gives access to the single, function-local accumulator instance.
// The instance starts zero-initialised and lives for the rest of the program.
AmssEvolveTimingStats &amss_evolve_timing_stats()
{
  static AmssEvolveTimingStats accumulator = {};
  return accumulator;
}
|
||||
|
||||
void amss_evolve_timing_reset()
|
||||
{
|
||||
AmssEvolveTimingStats &stats = amss_evolve_timing_stats();
|
||||
stats.step = 0.0;
|
||||
stats.rp = 0.0;
|
||||
stats.regrid = 0.0;
|
||||
stats.constraint = 0.0;
|
||||
}
|
||||
|
||||
void amss_evolve_timing_add_step(double sec)
|
||||
{
|
||||
amss_evolve_timing_stats().step += sec;
|
||||
}
|
||||
|
||||
void amss_evolve_timing_add_rp(double sec)
|
||||
{
|
||||
amss_evolve_timing_stats().rp += sec;
|
||||
}
|
||||
|
||||
void amss_evolve_timing_add_regrid(double sec)
|
||||
{
|
||||
amss_evolve_timing_stats().regrid += sec;
|
||||
}
|
||||
|
||||
void amss_evolve_timing_add_constraint(double sec)
|
||||
{
|
||||
amss_evolve_timing_stats().constraint += sec;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Compile-time switch for per-timestep memory usage collection/printing.
|
||||
@@ -288,6 +407,37 @@ bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
|
||||
return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
|
||||
}
|
||||
|
||||
// Counts the nodes of the variable list `vars`.
// Returns the node count, or -1 as soon as the list is found to hold more
// than BSSN_ESCALAR_CUDA_STATE_COUNT entries (the walk stops at that point).
// A null list yields 0.
int count_bssn_cuda_state_list(MyList<var> *vars)
{
  int seen = 0;
  for (MyList<var> *node = vars; node; node = node->next)
  {
    ++seen;
    if (seen > BSSN_ESCALAR_CUDA_STATE_COUNT)
      return -1;
  }
  return seen;
}
|
||||
|
||||
// Fills host_views[0 .. state_count-1] with the block's grid-function pointers
// (cg->fgfs[...]) for each variable in `vars`, in list order.
//
// cg          block whose fgfs table is indexed by each variable's sgfn
// vars        variable list; must contain exactly state_count entries
// state_count must equal BSSN_CUDA_STATE_COUNT or BSSN_ESCALAR_CUDA_STATE_COUNT
// host_views  output array with room for at least state_count pointers
//
// Returns true only when exactly state_count entries were written and the
// list was exhausted. Returns false for a null cg/host_views, an unsupported
// state_count, or a list of the wrong length (host_views may then be
// partially written).
bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
                                int state_count,
                                double **host_views)
{
  if (!cg)
    return false;
  if (!host_views)
    return false;

  const bool supported_count = state_count == BSSN_CUDA_STATE_COUNT ||
                               state_count == BSSN_ESCALAR_CUDA_STATE_COUNT;
  if (!supported_count)
    return false;

  int filled = 0;
  MyList<var> *node = vars;
  for (; node && filled < state_count; node = node->next)
  {
    host_views[filled] = cg->fgfs[node->data->sgfn];
    ++filled;
  }

  // Success requires both: every slot filled and no surplus list entries.
  return filled == state_count && node == 0;
}
|
||||
|
||||
bool bssn_cuda_use_resident_sync(int lev)
|
||||
{
|
||||
#ifdef WithShell
|
||||
@@ -467,6 +617,11 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
|
||||
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
|
||||
{
|
||||
var *vars[3] = {forx, fory, forz};
|
||||
double *bh_host_key[3] = {
|
||||
block->fgfs[forx->sgfn],
|
||||
block->fgfs[fory->sgfn],
|
||||
block->fgfs[forz->sgfn]
|
||||
};
|
||||
double soa3[9];
|
||||
for (int f = 0; f < 3; f++)
|
||||
{
|
||||
@@ -482,6 +637,7 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
|
||||
DH[0], DH[1], DH[2],
|
||||
x, y, z,
|
||||
interp_ordn, interp_sym,
|
||||
bh_host_key,
|
||||
soa3, shellf) != 0)
|
||||
{
|
||||
const int sx = ordn;
|
||||
@@ -552,6 +708,7 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
|
||||
|
||||
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
|
||||
{
|
||||
const int state_count = count_bssn_cuda_state_list(vars);
|
||||
MyList<Patch> *Pp = PatL;
|
||||
while (Pp)
|
||||
{
|
||||
@@ -561,13 +718,16 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
||||
{
|
||||
double *state_out[BSSN_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_cuda_views(cg, vars, state_out))
|
||||
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
|
||||
{
|
||||
cout << "CUDA BSSN state list mismatch on resident state download" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
if (bssn_cuda_download_resident_state(cg, cg->shape, state_out))
|
||||
const int rc = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT)
|
||||
? bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out)
|
||||
: bssn_cuda_download_resident_state(cg, cg->shape, state_out);
|
||||
if (rc)
|
||||
{
|
||||
cout << "CUDA resident state download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
@@ -585,6 +745,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
|
||||
|
||||
void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var> *vars, int myrank)
|
||||
{
|
||||
const int state_count = count_bssn_cuda_state_list(vars);
|
||||
MyList<Patch> *Pp = PatL;
|
||||
while (Pp)
|
||||
{
|
||||
@@ -594,13 +755,13 @@ void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var>
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
||||
{
|
||||
double *state_out[BSSN_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_cuda_views(cg, vars, state_out))
|
||||
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
|
||||
{
|
||||
cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
if (bssn_cuda_download_resident_state_if_present(cg, cg->shape, state_out))
|
||||
if (bssn_cuda_download_resident_state_count_if_present(cg, cg->shape, state_out, state_count))
|
||||
{
|
||||
cout << "CUDA resident state conditional download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
@@ -2890,6 +3051,10 @@ void bssn_class::Evolve(int Steps)
|
||||
|
||||
for (int ncount = 1; ncount < Steps + 1; ncount++)
|
||||
{
|
||||
const bool evolve_timing = amss_evolve_timing_enabled();
|
||||
const double evolve_t0 = evolve_timing ? MPI_Wtime() : 0.0;
|
||||
if (evolve_timing)
|
||||
amss_evolve_timing_reset();
|
||||
cuda_level0_constraint_cache_valid = false;
|
||||
#if BSSN_FINE_TIMING
|
||||
step_timing::reset();
|
||||
@@ -2918,9 +3083,12 @@ void bssn_class::Evolve(int Steps)
|
||||
|
||||
// misc::tillherecheck("before Constraint_Out");
|
||||
|
||||
const double constraint_t0 = evolve_timing ? MPI_Wtime() : 0.0;
|
||||
STEP_TIMER_DECL(timer_constraint_out);
|
||||
Constraint_Out(); // this will affect the Dump_List
|
||||
STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out);
|
||||
if (evolve_timing)
|
||||
amss_evolve_timing_add_constraint(MPI_Wtime() - constraint_t0);
|
||||
|
||||
LastDump += dT_mon;
|
||||
Last2dDump += dT_mon;
|
||||
@@ -3093,6 +3261,22 @@ void bssn_class::Evolve(int Steps)
|
||||
if (ncount % BSSN_FINE_TIMING_EVERY == 0)
|
||||
rhs_kernel_timing_report::report(myrank, nprocs, ncount, MPI_Wtime() - step_wall_start);
|
||||
#endif
|
||||
if (evolve_timing)
|
||||
{
|
||||
const AmssEvolveTimingStats &stats = amss_evolve_timing_stats();
|
||||
const double local[4] = {stats.step, stats.rp, stats.regrid, stats.constraint};
|
||||
double maxv[4] = {};
|
||||
MPI_Reduce((void *)local, maxv, 4, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
|
||||
if (myrank == 0)
|
||||
{
|
||||
const double wall = MPI_Wtime() - evolve_t0;
|
||||
const double known = maxv[0] + maxv[1] + maxv[2] + maxv[3];
|
||||
fprintf(stderr,
|
||||
"[AMSS-EVOLVE-TIMING] step=%d wall=%.6f step_fn=%.6f rp=%.6f "
|
||||
"regrid=%.6f constraint=%.6f other=%.6f\n",
|
||||
ncount, wall, maxv[0], maxv[1], maxv[2], maxv[3], wall - known);
|
||||
}
|
||||
}
|
||||
}
|
||||
/*
|
||||
#ifdef With_AHF
|
||||
@@ -3162,7 +3346,11 @@ void bssn_class::RecursiveStep(int lev)
|
||||
{
|
||||
// if(myrank==0) cout<<"level now = "<<lev<<" NoIteration = "<<i<<endl;
|
||||
YN = (i == NoIterations - 1) ? 1 : 0; // 1: same time level for coarse level and fine level
|
||||
const bool evolve_timing = amss_evolve_timing_enabled();
|
||||
const double step_t0 = evolve_timing ? MPI_Wtime() : 0.0;
|
||||
Step(lev, YN);
|
||||
if (evolve_timing)
|
||||
amss_evolve_timing_add_step(MPI_Wtime() - step_t0);
|
||||
|
||||
#if (AGM == 2)
|
||||
if (GH->levels == 1)
|
||||
@@ -3195,7 +3383,10 @@ void bssn_class::RecursiveStep(int lev)
|
||||
//
|
||||
// till here the PhysTime has updated dT_lev
|
||||
// if(myrank==0) cout<<"level now = "<<lev<<", "<<fgt(PhysTime-dT_lev,StartTime,dT_lev/2)<<endl;
|
||||
const double rp_t0 = evolve_timing ? MPI_Wtime() : 0.0;
|
||||
RestrictProlong(lev, YN, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), StateList, OldStateList, SynchList_cor);
|
||||
if (evolve_timing)
|
||||
amss_evolve_timing_add_rp(MPI_Wtime() - rp_t0);
|
||||
// RestrictProlong(lev,YN,false,StateList,OldStateList,SynchList_cor);
|
||||
|
||||
#ifdef WithShell
|
||||
@@ -3224,6 +3415,8 @@ void bssn_class::RecursiveStep(int lev)
|
||||
#endif
|
||||
|
||||
#if (REGLEV == 0)
|
||||
const bool evolve_timing = amss_evolve_timing_enabled();
|
||||
const double regrid_t0 = evolve_timing ? MPI_Wtime() : 0.0;
|
||||
STEP_TIMER_DECL(timer_regrid_onelevel);
|
||||
#if USE_CUDA_BSSN
|
||||
if (bssn_cuda_should_flush_before_regrid(GH, lev, Symmetry, BH_num, Porg0))
|
||||
@@ -3242,6 +3435,8 @@ void bssn_class::RecursiveStep(int lev)
|
||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
||||
#endif
|
||||
}
|
||||
if (evolve_timing)
|
||||
amss_evolve_timing_add_regrid(MPI_Wtime() - regrid_t0);
|
||||
STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel);
|
||||
#endif
|
||||
}
|
||||
@@ -6847,6 +7042,15 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
//
|
||||
// SynchList_cor old -----------
|
||||
{
|
||||
const bool rp_runtime_timing = amss_rp_timing_enabled();
|
||||
const double rp_runtime_start = rp_runtime_timing ? MPI_Wtime() : 0.0;
|
||||
const bool rp_detail_timing = amss_rp_detail_timing_enabled();
|
||||
double rp_t_prepare = 0.0;
|
||||
double rp_t_restrict = 0.0;
|
||||
double rp_t_coarse_sync = 0.0;
|
||||
double rp_t_outbd = 0.0;
|
||||
double rp_t_fine_sync = 0.0;
|
||||
double rp_t0 = 0.0;
|
||||
STEP_TIMER_DECL(timer_restrict_prolong);
|
||||
#if (PSTR == 1 || PSTR == 2)
|
||||
// stringstream a_stream;
|
||||
@@ -6858,6 +7062,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
MyList<Patch> *Pp, *Ppc;
|
||||
if (lev > trfls && YN == 0) // time refinement levels and for intermediat time level
|
||||
{
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
Pp = GH->PatL[lev - 1];
|
||||
while (Pp)
|
||||
{
|
||||
@@ -6873,6 +7078,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
#endif
|
||||
Pp = Pp->next;
|
||||
}
|
||||
if (rp_detail_timing) rp_t_prepare += MPI_Wtime() - rp_t0;
|
||||
|
||||
#if (PSTR == 1 || PSTR == 2)
|
||||
// Pp=GH->PatL[lev];
|
||||
@@ -6889,14 +7095,18 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
#endif
|
||||
|
||||
#if (RPB == 0)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
|
||||
#else
|
||||
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
|
||||
#endif
|
||||
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
|
||||
#elif (RPB == 1)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
|
||||
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
|
||||
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
|
||||
#endif
|
||||
|
||||
#if (PSTR == 1 || PSTR == 2)
|
||||
@@ -6907,10 +7117,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
#endif
|
||||
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
|
||||
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
|
||||
#else
|
||||
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -6922,6 +7136,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
#endif
|
||||
|
||||
#if (RPB == 0)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
#if (MIXOUTB == 0)
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
@@ -6941,9 +7156,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
#elif (MIXOUTB == 1)
|
||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
|
||||
#endif
|
||||
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
|
||||
#elif (RPB == 1)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
|
||||
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
|
||||
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
|
||||
#endif
|
||||
|
||||
#if (PSTR == 1 || PSTR == 2)
|
||||
@@ -6964,14 +7182,18 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
#endif
|
||||
|
||||
#if (RPB == 0)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
||||
#else
|
||||
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
|
||||
#endif
|
||||
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
|
||||
#elif (RPB == 1)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
|
||||
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
|
||||
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
|
||||
#endif
|
||||
|
||||
#if (PSTR == 1 || PSTR == 2)
|
||||
@@ -6982,10 +7204,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
#endif
|
||||
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
|
||||
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
|
||||
#else
|
||||
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -6997,6 +7223,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
#endif
|
||||
|
||||
#if (RPB == 0)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
#if (MIXOUTB == 0)
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
@@ -7016,9 +7243,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
#elif (MIXOUTB == 1)
|
||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
||||
#endif
|
||||
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
|
||||
#elif (RPB == 1)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
|
||||
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
|
||||
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
|
||||
#endif
|
||||
|
||||
#if (PSTR == 1 || PSTR == 2)
|
||||
@@ -7030,9 +7260,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
}
|
||||
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
Parallel::Sync(GH->PatL[lev], SL, Symmetry);
|
||||
if (rp_detail_timing) rp_t_fine_sync += MPI_Wtime() - rp_t0;
|
||||
#else
|
||||
if (rp_detail_timing) rp_t0 = MPI_Wtime();
|
||||
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
|
||||
if (rp_detail_timing) rp_t_fine_sync += MPI_Wtime() - rp_t0;
|
||||
#endif
|
||||
|
||||
#if (PSTR == 1 || PSTR == 2)
|
||||
@@ -7042,6 +7276,27 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
|
||||
#endif
|
||||
}
|
||||
if (rp_runtime_timing)
|
||||
{
|
||||
const double local_sec = MPI_Wtime() - rp_runtime_start;
|
||||
double max_sec = 0.0;
|
||||
MPI_Reduce((void *)&local_sec, &max_sec, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
|
||||
if (myrank == 0)
|
||||
fprintf(stderr, "[AMSS-RP-TIMING] lev=%d YN=%d BB=%d sec=%.6f\n",
|
||||
lev, YN, BB ? 1 : 0, max_sec);
|
||||
}
|
||||
if (rp_detail_timing)
|
||||
{
|
||||
double local_detail[5] = {rp_t_prepare, rp_t_restrict, rp_t_coarse_sync, rp_t_outbd, rp_t_fine_sync};
|
||||
double max_detail[5] = {};
|
||||
MPI_Reduce(local_detail, max_detail, 5, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
|
||||
if (myrank == 0)
|
||||
fprintf(stderr,
|
||||
"[AMSS-RP-DETAIL] lev=%d YN=%d BB=%d prepare=%.6f restrict=%.6f "
|
||||
"coarse_sync=%.6f outbd=%.6f fine_sync=%.6f\n",
|
||||
lev, YN, BB ? 1 : 0, max_detail[0], max_detail[1],
|
||||
max_detail[2], max_detail[3], max_detail[4]);
|
||||
}
|
||||
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
|
||||
}
|
||||
|
||||
@@ -7229,7 +7484,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
||||
|
||||
#if (RPB == 0)
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
|
||||
if (amss_cached_rp_restrict_enabled())
|
||||
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
|
||||
else
|
||||
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
|
||||
#else
|
||||
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
|
||||
#endif
|
||||
@@ -7239,7 +7497,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
||||
#endif
|
||||
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
|
||||
if (amss_rp_skip_coarse_sync_enabled())
|
||||
{
|
||||
}
|
||||
else if (amss_cached_rp_coarse_sync_enabled())
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
else
|
||||
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
|
||||
#else
|
||||
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
@@ -7249,16 +7513,23 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
||||
#if (RPB == 0)
|
||||
#if (MIXOUTB == 0)
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
while (Ppc)
|
||||
if (amss_cached_rp_outbd_enabled())
|
||||
{
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
|
||||
}
|
||||
else
|
||||
{
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
while (Ppc)
|
||||
{
|
||||
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
|
||||
Pp = Pp->next;
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
|
||||
Pp = Pp->next;
|
||||
}
|
||||
Ppc = Ppc->next;
|
||||
}
|
||||
Ppc = Ppc->next;
|
||||
}
|
||||
#else
|
||||
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
|
||||
@@ -7277,7 +7548,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
||||
cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
|
||||
#if (RPB == 0)
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
|
||||
if (amss_cached_rp_restrict_enabled())
|
||||
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
|
||||
else
|
||||
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
|
||||
#else
|
||||
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
|
||||
#endif
|
||||
@@ -7287,7 +7561,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
||||
#endif
|
||||
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
|
||||
if (amss_rp_skip_coarse_sync_enabled())
|
||||
{
|
||||
}
|
||||
else if (amss_cached_rp_coarse_sync_enabled())
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
else
|
||||
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
|
||||
#else
|
||||
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
@@ -7297,16 +7577,23 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
||||
#if (RPB == 0)
|
||||
#if (MIXOUTB == 0)
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
while (Ppc)
|
||||
if (amss_cached_rp_outbd_enabled())
|
||||
{
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
|
||||
}
|
||||
else
|
||||
{
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
while (Ppc)
|
||||
{
|
||||
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
|
||||
Pp = Pp->next;
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
|
||||
Pp = Pp->next;
|
||||
}
|
||||
Ppc = Ppc->next;
|
||||
}
|
||||
Ppc = Ppc->next;
|
||||
}
|
||||
#else
|
||||
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
|
||||
@@ -7321,7 +7608,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
||||
}
|
||||
|
||||
#if (ABEtype == 1 || ABEtype == 2)
|
||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
||||
if (amss_cached_rp_fine_sync_enabled())
|
||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
|
||||
else
|
||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
||||
#else
|
||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user