Stabilize EScalar CUDA fallback path
This commit is contained in:
@@ -140,6 +140,88 @@ bool escalar_gpu_rk_enabled()
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
// Reports whether the experimental "resident" EScalar path is switched on.
//
// Two environment variables must both hold a non-zero integer:
//   AMSS_ESCALAR_RESIDENT and AMSS_ESCALAR_RESIDENT_EXPERIMENTAL.
// The environment is consulted only on the first call; the answer is cached
// in a function-local static, so later changes to the environment are not
// observed.
//
// Returns true when both variables are set and parse to a non-zero value.
bool escalar_resident_enabled()
{
  static int enabled = -1; // -1: not evaluated yet, afterwards 0 or 1
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_ESCALAR_RESIDENT");
    const char *experimental = getenv("AMSS_ESCALAR_RESIDENT_EXPERIMENTAL");
    // strtol rather than atoi: atoi has undefined behaviour when the text
    // does not fit an int, strtol saturates and therefore stays non-zero.
    const bool resident_on = env && strtol(env, NULL, 10) != 0;
    const bool experimental_on = experimental && strtol(experimental, NULL, 10) != 0;
    enabled = (resident_on && experimental_on) ? 1 : 0;
  }
  return enabled != 0;
}
|
||||
|
||||
// Reports whether per-step timing output for the EScalar stepper is enabled.
//
// Enabled when AMSS_ESCALAR_STEP_PROFILE is set to a non-zero integer. The
// variable is read on the first call only and the result is cached for the
// remainder of the process.
bool escalar_step_profile_enabled()
{
  static int enabled = -1; // -1: not evaluated yet, afterwards 0 or 1
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_ESCALAR_STEP_PROFILE");
    // strtol rather than atoi: well defined even for out-of-range text.
    enabled = (env && strtol(env, NULL, 10) != 0) ? 1 : 0;
  }
  return enabled != 0;
}
|
||||
|
||||
// Returns the reporting stride for the step profiler: a report is meant to be
// emitted on every N-th call, where N is this value.
//
// N comes from AMSS_ESCALAR_STEP_PROFILE_EVERY. Anything that is unset, not a
// positive integer, or too large for an int yields the default of 1. The
// variable is read on the first call only; the result is cached afterwards.
int escalar_step_profile_every()
{
  static int every = -1; // -1: not evaluated yet, afterwards always >= 1
  if (every < 0)
  {
    every = 1;
    const char *env = getenv("AMSS_ESCALAR_STEP_PROFILE_EVERY");
    if (env)
    {
      // Parse once; strtol is well defined for out-of-range text where atoi
      // is not.
      const long parsed = strtol(env, NULL, 10);
      const int narrowed = static_cast<int>(parsed);
      // The round-trip comparison rejects values that do not fit an int.
      if (parsed > 0 && static_cast<long>(narrowed) == parsed)
        every = narrowed;
    }
  }
  return every;
}
|
||||
|
||||
// Wall-clock bookkeeping for one invocation of the EScalar time step.
//
// `start` holds the timestamp taken when the profile is initialised; every
// other field is an accumulated duration, in the units returned by
// MPI_Wtime() (seconds), for one phase of the step.
struct EScalarStepProfile
{
  double start;            // timestamp taken by escalar_profile_init
  double predictor_rhs;    // predictor right-hand-side loop
  double predictor_sync;   // predictor synchronisation
  double analysis;         // data-analysis phase
  double corrector_rhs;    // corrector right-hand-side loops
  double corrector_sync;   // corrector synchronisation
  double restrict_prolong; // restrict/prolong (mesh refinement boundary) phase
  double other_sync;       // remaining synchronisation time
};
|
||||
|
||||
// Resets a step profile for a new measurement.
//
// All accumulated buckets are set to zero and `start` is stamped with the
// current MPI_Wtime() reading.
//
// p: profile to reset; its previous contents are discarded.
void escalar_profile_init(EScalarStepProfile &p)
{
  // Value-initialisation zeroes every field, so a bucket added to the struct
  // later cannot be forgotten here.
  p = EScalarStepProfile();
  p.start = MPI_Wtime();
}
|
||||
|
||||
void escalar_profile_add(double &bucket, double t0)
|
||||
{
|
||||
bucket += MPI_Wtime() - t0;
|
||||
}
|
||||
|
||||
void escalar_profile_report(const EScalarStepProfile &p, int lev, int myrank)
|
||||
{
|
||||
if (myrank != 0 || !escalar_step_profile_enabled())
|
||||
return;
|
||||
static long long call_count = 0;
|
||||
++call_count;
|
||||
const int every = escalar_step_profile_every();
|
||||
if (every > 1 && (call_count % every) != 0)
|
||||
return;
|
||||
const double total = MPI_Wtime() - p.start;
|
||||
fprintf(stderr,
|
||||
"[AMSS-ESCALAR-PROFILE] call=%lld lev=%d total=%.6f pred_rhs=%.6f pred_sync=%.6f analysis=%.6f corr_rhs=%.6f corr_sync=%.6f rp=%.6f other_sync=%.6f\n",
|
||||
call_count, lev, total, p.predictor_rhs, p.predictor_sync,
|
||||
p.analysis, p.corrector_rhs, p.corrector_sync,
|
||||
p.restrict_prolong, p.other_sync);
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
void clear_var_list(MyList<var> *&list)
|
||||
{
|
||||
if (list)
|
||||
@@ -173,6 +255,34 @@ void download_bssn_cuda_prefix_if_present(MyList<Patch> *PatL,
|
||||
}
|
||||
}
|
||||
|
||||
// Calls bssn_cuda_escalar_download_fields_if_present for the given pair of
// scalar variables on every block owned by this rank.
//
// PatL:     head of the patch list to walk; an empty list is a no-op.
// Sphi_var: first variable of the pair (indexes cg->fgfs via its sgfn).
// Spi_var:  second variable of the pair (indexes cg->fgfs via its sgfn).
// myrank:   rank of the calling process; blocks with a different rank are
//           skipped.
//
// Does nothing when either variable pointer is null.
// NOTE(review): presumably the callee copies device-resident field data back
// into the host arrays passed to it - confirm against its definition.
void download_escalar_cuda_pair_if_present(MyList<Patch> *PatL,
                                           var *Sphi_var,
                                           var *Spi_var,
                                           int myrank)
{
  if (!Sphi_var || !Spi_var)
    return;

  for (MyList<Patch> *patch = PatL; patch; patch = patch->next)
  {
    // Visit the patch's blocks from blb up to and including ble.
    for (MyList<Block> *BP = patch->data->blb; BP; BP = BP->next)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank)
      {
        bssn_cuda_escalar_download_fields_if_present(
            cg, cg->shape,
            cg->fgfs[Sphi_var->sgfn],
            cg->fgfs[Spi_var->sgfn]);
      }
      if (BP == patch->data->ble)
        break;
    }
  }
}
|
||||
|
||||
int run_bssn_escalar_cuda_substep(Block *cg,
|
||||
MyList<var> *state_in_list,
|
||||
MyList<var> *state_out_list,
|
||||
@@ -992,8 +1102,8 @@ void bssnEScalar_class::Read_Pablo()
|
||||
|
||||
//================================================================================================
|
||||
|
||||
void bssnEScalar_class::Step(int lev, int YN)
|
||||
{
|
||||
void bssnEScalar_class::Step(int lev, int YN)
|
||||
{
|
||||
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
||||
#ifdef With_AHF
|
||||
AH_Step_Find(lev, dT_lev);
|
||||
@@ -1003,15 +1113,18 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
if (lev < GH->movls)
|
||||
ndeps = numepsb;
|
||||
double TRK4 = PhysTime;
|
||||
int iter_count = 0; // count RK4 substeps
|
||||
int pre = 0, cor = 1;
|
||||
int ERROR = 0;
|
||||
|
||||
MyList<ss_patch> *sPp;
|
||||
// Predictor
|
||||
MyList<Patch> *Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
int iter_count = 0; // count RK4 substeps
|
||||
int pre = 0, cor = 1;
|
||||
int ERROR = 0;
|
||||
EScalarStepProfile escalar_profile;
|
||||
escalar_profile_init(escalar_profile);
|
||||
|
||||
MyList<ss_patch> *sPp;
|
||||
// Predictor
|
||||
const double escalar_profile_predictor_rhs_start = MPI_Wtime();
|
||||
MyList<Patch> *Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
MyList<Block> *BP = Pp->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
@@ -1101,6 +1214,8 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
{
|
||||
if (scalar_gpu_rk_done)
|
||||
{
|
||||
if (!escalar_resident_enabled())
|
||||
{
|
||||
#ifndef WithShell
|
||||
if (lev > 0) // fix BD point
|
||||
#endif
|
||||
@@ -1112,6 +1227,7 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn],
|
||||
varl0->data->SoA,
|
||||
Symmetry, cor);
|
||||
}
|
||||
|
||||
varl0 = varl0->next;
|
||||
varl = varl->next;
|
||||
@@ -1157,11 +1273,12 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
// check error information
|
||||
{
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
escalar_profile_add(escalar_profile.predictor_rhs, escalar_profile_predictor_rhs_start);
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
@@ -1325,10 +1442,14 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
#endif
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
const double escalar_profile_predictor_sync_start = MPI_Wtime();
|
||||
Parallel::Sync_cached(GH->PatL[lev], BSSNSynchList_pre, Symmetry, sync_cache_pre[lev]);
|
||||
Parallel::Sync_cached(GH->PatL[lev], ScalarSynchList_pre, Symmetry, sync_cache_scalar_pre[lev]);
|
||||
escalar_profile_add(escalar_profile.predictor_sync, escalar_profile_predictor_sync_start);
|
||||
#else
|
||||
const double escalar_profile_predictor_sync_start = MPI_Wtime();
|
||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
|
||||
escalar_profile_add(escalar_profile.predictor_sync, escalar_profile_predictor_sync_start);
|
||||
#endif
|
||||
|
||||
#ifdef WithShell
|
||||
@@ -1381,21 +1502,28 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
}
|
||||
}
|
||||
}
|
||||
// data analysis part
|
||||
// Warning NOTE: the variables1 are used as temporary storage room
|
||||
if (lev == a_lev)
|
||||
{
|
||||
AnalysisStuff_EScalar(lev, dT_lev);
|
||||
}
|
||||
// corrector
|
||||
for (iter_count = 1; iter_count < 4; iter_count++)
|
||||
{
|
||||
// data analysis part
|
||||
// Warning NOTE: the variables1 are used as temporary storage room
|
||||
if (lev == a_lev)
|
||||
{
|
||||
const double escalar_profile_analysis_start = MPI_Wtime();
|
||||
#if USE_CUDA_BSSN
|
||||
if (escalar_resident_enabled())
|
||||
download_escalar_cuda_pair_if_present(GH->PatL[lev], Sphi, Spi, myrank);
|
||||
#endif
|
||||
AnalysisStuff_EScalar(lev, dT_lev);
|
||||
escalar_profile_add(escalar_profile.analysis, escalar_profile_analysis_start);
|
||||
}
|
||||
// corrector
|
||||
for (iter_count = 1; iter_count < 4; iter_count++)
|
||||
{
|
||||
// for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
|
||||
if (iter_count == 1 || iter_count == 3)
|
||||
TRK4 += dT_lev / 2;
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
if (iter_count == 1 || iter_count == 3)
|
||||
TRK4 += dT_lev / 2;
|
||||
const double escalar_profile_corrector_rhs_start = MPI_Wtime();
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
MyList<Block> *BP = Pp->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
@@ -1494,6 +1622,8 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
{
|
||||
if (scalar_gpu_rk_done)
|
||||
{
|
||||
if (!escalar_resident_enabled())
|
||||
{
|
||||
#ifndef WithShell
|
||||
if (lev > 0) // fix BD point
|
||||
#endif
|
||||
@@ -1505,6 +1635,7 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn],
|
||||
varl0->data->SoA,
|
||||
Symmetry, cor);
|
||||
}
|
||||
|
||||
varl0 = varl0->next;
|
||||
varl = varl->next;
|
||||
@@ -1552,11 +1683,12 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
|
||||
// check error information
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
escalar_profile_add(escalar_profile.corrector_rhs, escalar_profile_corrector_rhs_start);
|
||||
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
@@ -1731,10 +1863,14 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
#endif
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
const double escalar_profile_corrector_sync_start = MPI_Wtime();
|
||||
Parallel::Sync_cached(GH->PatL[lev], BSSNSynchList_cor, Symmetry, sync_cache_cor[lev]);
|
||||
Parallel::Sync_cached(GH->PatL[lev], ScalarSynchList_cor, Symmetry, sync_cache_scalar_cor[lev]);
|
||||
escalar_profile_add(escalar_profile.corrector_sync, escalar_profile_corrector_sync_start);
|
||||
#else
|
||||
const double escalar_profile_corrector_sync_start = MPI_Wtime();
|
||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
|
||||
escalar_profile_add(escalar_profile.corrector_sync, escalar_profile_corrector_sync_start);
|
||||
#endif
|
||||
|
||||
#ifdef WithShell
|
||||
@@ -1837,17 +1973,21 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
|
||||
#if (RPS == 0)
|
||||
// mesh refinement boundary part
|
||||
const double escalar_profile_rp_start = MPI_Wtime();
|
||||
#if USE_CUDA_BSSN
|
||||
{
|
||||
const char *mixed_env = getenv("AMSS_ESCALAR_MIXED_GPU_RP");
|
||||
const bool mixed_gpu_rp = (mixed_env && atoi(mixed_env) != 0);
|
||||
const char *split_env = getenv("AMSS_ESCALAR_SPLIT_RP");
|
||||
const bool split_rp = (split_env && atoi(split_env) != 0);
|
||||
if (escalar_resident_enabled() && !split_rp)
|
||||
download_escalar_cuda_pair_if_present(GH->PatL[lev], Sphi1, Spi1, myrank);
|
||||
if (!mixed_gpu_rp && !split_rp)
|
||||
download_bssn_cuda_prefix_if_present(GH->PatL[lev], SynchList_cor, myrank);
|
||||
}
|
||||
#endif
|
||||
RestrictProlong(lev, YN, BB);
|
||||
escalar_profile_add(escalar_profile.restrict_prolong, escalar_profile_rp_start);
|
||||
|
||||
#ifdef WithShell
|
||||
if (lev == 0)
|
||||
@@ -1910,18 +2050,19 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
}
|
||||
#endif
|
||||
// for black hole position
|
||||
if (BH_num > 0 && lev == GH->levels - 1)
|
||||
{
|
||||
if (BH_num > 0 && lev == GH->levels - 1)
|
||||
{
|
||||
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
||||
{
|
||||
Porg0[ithBH][0] = Porg1[ithBH][0];
|
||||
Porg0[ithBH][1] = Porg1[ithBH][1];
|
||||
Porg0[ithBH][2] = Porg1[ithBH][2];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//================================================================================================
|
||||
Porg0[ithBH][2] = Porg1[ithBH][2];
|
||||
}
|
||||
}
|
||||
escalar_profile_report(escalar_profile, lev, myrank);
|
||||
}
|
||||
|
||||
//================================================================================================
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user