Stabilize EScalar CUDA fallback path

This commit is contained in:
2026-05-03 16:05:47 +08:00
parent 4430d04ee7
commit e4c10eca0f
5 changed files with 1542 additions and 127 deletions

View File

@@ -140,6 +140,88 @@ bool escalar_gpu_rk_enabled()
return enabled != 0;
}
// Returns true only when BOTH AMSS_ESCALAR_RESIDENT and
// AMSS_ESCALAR_RESIDENT_EXPERIMENTAL are set to values that atoi() parses as
// non-zero. The environment is read on the first call only; the result is
// cached in a function-local static for every later call.
bool escalar_resident_enabled()
{
  static int cached = -1; // -1: not evaluated yet; afterwards 0 or 1
  if (cached >= 0)
    return cached != 0;

  const char *resident = getenv("AMSS_ESCALAR_RESIDENT");
  const char *opt_in = getenv("AMSS_ESCALAR_RESIDENT_EXPERIMENTAL");

  const bool resident_on = resident && atoi(resident) != 0;
  const bool opt_in_on = opt_in && atoi(opt_in) != 0;

  cached = (resident_on && opt_in_on) ? 1 : 0;
  return cached != 0;
}
// Returns true when AMSS_ESCALAR_STEP_PROFILE is set to a value that atoi()
// parses as non-zero. The variable is read once; later calls reuse the
// cached answer.
bool escalar_step_profile_enabled()
{
  static int state = -1; // -1 until the environment has been inspected
  if (state < 0)
  {
    state = 0;
    if (const char *value = getenv("AMSS_ESCALAR_STEP_PROFILE"))
    {
      if (atoi(value) != 0)
        state = 1;
    }
  }
  return state == 1;
}
// Returns the reporting interval read from AMSS_ESCALAR_STEP_PROFILE_EVERY.
//
// The variable is parsed once, on the first call, and the result is cached.
// A missing, non-numeric, zero or negative value yields 1. Values too large
// for an int are clamped to the largest int instead of going through atoi(),
// whose behaviour is undefined for out-of-range input.
int escalar_step_profile_every()
{
  static int every = -1;
  if (every < 0)
  {
    every = 1;
    const char *env = getenv("AMSS_ESCALAR_STEP_PROFILE_EVERY");
    if (env)
    {
      // Largest int, derived without needing an extra header.
      const long int_max = static_cast<long>(static_cast<unsigned int>(-1) / 2);
      // strtol saturates at LONG_MAX on overflow, so the clamp below is safe.
      const long parsed = strtol(env, NULL, 10);
      if (parsed > 0)
        every = static_cast<int>(parsed > int_max ? int_max : parsed);
    }
  }
  return every;
}
// Wall-clock accounting for one profiled step. `start` holds an MPI_Wtime()
// stamp; every other member is a running total of MPI_Wtime() differences
// (seconds) that escalar_profile_report prints under the label noted below.
// All members default to 0.0 so a default-constructed instance never holds
// indeterminate values.
struct EScalarStepProfile
{
  double start = 0.0;            // MPI_Wtime() stamp; "total" is measured from here
  double predictor_rhs = 0.0;    // printed as pred_rhs
  double predictor_sync = 0.0;   // printed as pred_sync
  double analysis = 0.0;         // printed as analysis
  double corrector_rhs = 0.0;    // printed as corr_rhs
  double corrector_sync = 0.0;   // printed as corr_sync
  double restrict_prolong = 0.0; // printed as rp
  double other_sync = 0.0;       // printed as other_sync
};
// Resets a profile record: every accumulator becomes 0.0 and `start` is set
// to the current MPI_Wtime() reading.
void escalar_profile_init(EScalarStepProfile &p)
{
  // Value-initialisation zeroes all members in one go.
  p = EScalarStepProfile();
  p.start = MPI_Wtime();
}
// Adds the time elapsed since `t0` (an earlier MPI_Wtime() reading) to
// `bucket`.
void escalar_profile_add(double &bucket, double t0)
{
  const double elapsed = MPI_Wtime() - t0;
  bucket += elapsed;
}
// Prints one "[AMSS-ESCALAR-PROFILE]" line with the collected timings to
// stderr.
//
// Nothing is printed unless myrank is 0 and step profiling is enabled. Calls
// that pass both checks are counted in a function-local static; when the
// configured interval is greater than 1, only calls whose count is a
// multiple of the interval produce output.
//
//   p      - timings to print; "total" is MPI_Wtime() minus p.start
//   lev    - value printed in the lev= field
//   myrank - rank of the caller; only rank 0 reports
void escalar_profile_report(const EScalarStepProfile &p, int lev, int myrank)
{
  if (myrank != 0)
    return;
  if (!escalar_step_profile_enabled())
    return;

  static long long reports_seen = 0;
  reports_seen += 1;

  const int stride = escalar_step_profile_every();
  const bool skip_this_call = stride > 1 && (reports_seen % stride) != 0;
  if (skip_this_call)
    return;

  const double elapsed = MPI_Wtime() - p.start;
  fprintf(stderr,
          "[AMSS-ESCALAR-PROFILE] call=%lld lev=%d total=%.6f pred_rhs=%.6f pred_sync=%.6f analysis=%.6f corr_rhs=%.6f corr_sync=%.6f rp=%.6f other_sync=%.6f\n",
          reports_seen, lev, elapsed,
          p.predictor_rhs, p.predictor_sync, p.analysis,
          p.corrector_rhs, p.corrector_sync,
          p.restrict_prolong, p.other_sync);
  fflush(stderr);
}
void clear_var_list(MyList<var> *&list)
{
if (list)
@@ -173,6 +255,34 @@ void download_bssn_cuda_prefix_if_present(MyList<Patch> *PatL,
}
}
// Walks every block of every patch in PatL and, for blocks whose rank equals
// myrank, calls bssn_cuda_escalar_download_fields_if_present with the block,
// its shape and the two grid-function arrays selected by Sphi_var->sgfn and
// Spi_var->sgfn. Does nothing when either variable pointer is null.
void download_escalar_cuda_pair_if_present(MyList<Patch> *PatL,
                                           var *Sphi_var,
                                           var *Spi_var,
                                           int myrank)
{
  if (Sphi_var == 0 || Spi_var == 0)
    return;

  for (MyList<Patch> *patch_node = PatL; patch_node; patch_node = patch_node->next)
  {
    for (MyList<Block> *block_node = patch_node->data->blb; block_node;
         block_node = block_node->next)
    {
      Block *blk = block_node->data;
      if (blk->rank == myrank)
      {
        bssn_cuda_escalar_download_fields_if_present(
            blk, blk->shape,
            blk->fgfs[Sphi_var->sgfn],
            blk->fgfs[Spi_var->sgfn]);
      }

      // ble marks the last block of this patch; do not walk past it.
      if (block_node == patch_node->data->ble)
        break;
    }
  }
}
int run_bssn_escalar_cuda_substep(Block *cg,
MyList<var> *state_in_list,
MyList<var> *state_out_list,
@@ -992,8 +1102,8 @@ void bssnEScalar_class::Read_Pablo()
//================================================================================================
void bssnEScalar_class::Step(int lev, int YN)
{
void bssnEScalar_class::Step(int lev, int YN)
{
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
#ifdef With_AHF
AH_Step_Find(lev, dT_lev);
@@ -1003,15 +1113,18 @@ void bssnEScalar_class::Step(int lev, int YN)
if (lev < GH->movls)
ndeps = numepsb;
double TRK4 = PhysTime;
int iter_count = 0; // count RK4 substeps
int pre = 0, cor = 1;
int ERROR = 0;
MyList<ss_patch> *sPp;
// Predictor
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
int iter_count = 0; // count RK4 substeps
int pre = 0, cor = 1;
int ERROR = 0;
EScalarStepProfile escalar_profile;
escalar_profile_init(escalar_profile);
MyList<ss_patch> *sPp;
// Predictor
const double escalar_profile_predictor_rhs_start = MPI_Wtime();
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
@@ -1101,6 +1214,8 @@ void bssnEScalar_class::Step(int lev, int YN)
{
if (scalar_gpu_rk_done)
{
if (!escalar_resident_enabled())
{
#ifndef WithShell
if (lev > 0) // fix BD point
#endif
@@ -1112,6 +1227,7 @@ void bssnEScalar_class::Step(int lev, int YN)
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn],
varl0->data->SoA,
Symmetry, cor);
}
varl0 = varl0->next;
varl = varl->next;
@@ -1157,11 +1273,12 @@ void bssnEScalar_class::Step(int lev, int YN)
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
// check error information
{
}
Pp = Pp->next;
}
escalar_profile_add(escalar_profile.predictor_rhs, escalar_profile_predictor_rhs_start);
// check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
@@ -1325,10 +1442,14 @@ void bssnEScalar_class::Step(int lev, int YN)
#endif
#if USE_CUDA_BSSN
const double escalar_profile_predictor_sync_start = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev], BSSNSynchList_pre, Symmetry, sync_cache_pre[lev]);
Parallel::Sync_cached(GH->PatL[lev], ScalarSynchList_pre, Symmetry, sync_cache_scalar_pre[lev]);
escalar_profile_add(escalar_profile.predictor_sync, escalar_profile_predictor_sync_start);
#else
const double escalar_profile_predictor_sync_start = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
escalar_profile_add(escalar_profile.predictor_sync, escalar_profile_predictor_sync_start);
#endif
#ifdef WithShell
@@ -1381,21 +1502,28 @@ void bssnEScalar_class::Step(int lev, int YN)
}
}
}
// data analysis part
// Warning NOTE: the variables1 are used as temp storage room
if (lev == a_lev)
{
AnalysisStuff_EScalar(lev, dT_lev);
}
// corrector
for (iter_count = 1; iter_count < 4; iter_count++)
{
// data analysis part
// Warning NOTE: the variables1 are used as temp storage room
if (lev == a_lev)
{
const double escalar_profile_analysis_start = MPI_Wtime();
#if USE_CUDA_BSSN
if (escalar_resident_enabled())
download_escalar_cuda_pair_if_present(GH->PatL[lev], Sphi, Spi, myrank);
#endif
AnalysisStuff_EScalar(lev, dT_lev);
escalar_profile_add(escalar_profile.analysis, escalar_profile_analysis_start);
}
// corrector
for (iter_count = 1; iter_count < 4; iter_count++)
{
// for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2;
Pp = GH->PatL[lev];
while (Pp)
{
if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2;
const double escalar_profile_corrector_rhs_start = MPI_Wtime();
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
@@ -1494,6 +1622,8 @@ void bssnEScalar_class::Step(int lev, int YN)
{
if (scalar_gpu_rk_done)
{
if (!escalar_resident_enabled())
{
#ifndef WithShell
if (lev > 0) // fix BD point
#endif
@@ -1505,6 +1635,7 @@ void bssnEScalar_class::Step(int lev, int YN)
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn],
varl0->data->SoA,
Symmetry, cor);
}
varl0 = varl0->next;
varl = varl->next;
@@ -1552,11 +1683,12 @@ void bssnEScalar_class::Step(int lev, int YN)
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
// check error information
}
Pp = Pp->next;
}
escalar_profile_add(escalar_profile.corrector_rhs, escalar_profile_corrector_rhs_start);
// check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
@@ -1731,10 +1863,14 @@ void bssnEScalar_class::Step(int lev, int YN)
#endif
#if USE_CUDA_BSSN
const double escalar_profile_corrector_sync_start = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev], BSSNSynchList_cor, Symmetry, sync_cache_cor[lev]);
Parallel::Sync_cached(GH->PatL[lev], ScalarSynchList_cor, Symmetry, sync_cache_scalar_cor[lev]);
escalar_profile_add(escalar_profile.corrector_sync, escalar_profile_corrector_sync_start);
#else
const double escalar_profile_corrector_sync_start = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
escalar_profile_add(escalar_profile.corrector_sync, escalar_profile_corrector_sync_start);
#endif
#ifdef WithShell
@@ -1837,17 +1973,21 @@ void bssnEScalar_class::Step(int lev, int YN)
#if (RPS == 0)
// mesh refinement boundary part
const double escalar_profile_rp_start = MPI_Wtime();
#if USE_CUDA_BSSN
{
const char *mixed_env = getenv("AMSS_ESCALAR_MIXED_GPU_RP");
const bool mixed_gpu_rp = (mixed_env && atoi(mixed_env) != 0);
const char *split_env = getenv("AMSS_ESCALAR_SPLIT_RP");
const bool split_rp = (split_env && atoi(split_env) != 0);
if (escalar_resident_enabled() && !split_rp)
download_escalar_cuda_pair_if_present(GH->PatL[lev], Sphi1, Spi1, myrank);
if (!mixed_gpu_rp && !split_rp)
download_bssn_cuda_prefix_if_present(GH->PatL[lev], SynchList_cor, myrank);
}
#endif
RestrictProlong(lev, YN, BB);
escalar_profile_add(escalar_profile.restrict_prolong, escalar_profile_rp_start);
#ifdef WithShell
if (lev == 0)
@@ -1910,18 +2050,19 @@ void bssnEScalar_class::Step(int lev, int YN)
}
#endif
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
if (BH_num > 0 && lev == GH->levels - 1)
{
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
Porg0[ithBH][0] = Porg1[ithBH][0];
Porg0[ithBH][1] = Porg1[ithBH][1];
Porg0[ithBH][2] = Porg1[ithBH][2];
}
}
}
//================================================================================================
Porg0[ithBH][2] = Porg1[ithBH][2];
}
}
escalar_profile_report(escalar_profile, lev, myrank);
}
//================================================================================================