Complete BSSN-EScalar CUDA resident transfers

This commit is contained in:
2026-05-05 23:57:42 +08:00
parent 85fe29cc2e
commit ae64a22178
5 changed files with 995 additions and 72 deletions

View File

@@ -90,6 +90,22 @@ bool bssn_escalar_cuda_keep_resident_after_step(int lev, int trfls_in, int analy
return false;
if (lev == analysis_lev)
return false;
static int release_only_level = -2;
if (release_only_level == -2)
{
const char *env = getenv("AMSS_CUDA_ESCALAR_RELEASE_ONLY_LEVEL");
release_only_level = (env && atoi(env) >= 0) ? atoi(env) : -1;
}
if (release_only_level >= 0)
return lev != release_only_level;
static int keep_level_limit = -2;
if (keep_level_limit == -2)
{
const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_LEVELS_BELOW");
keep_level_limit = (env && atoi(env) >= 0) ? atoi(env) : -1;
}
if (keep_level_limit >= 0)
return lev < keep_level_limit;
if (keep_all_levels)
return true;
return lev < trfls_in;
@@ -125,6 +141,138 @@ bool bssn_escalar_timing_enabled()
return enabled != 0;
}
// Tells whether the option controlled by the environment variable
// AMSS_CUDA_ESCALAR_POST_RP_DOWNLOAD is switched on.
//
// The variable is looked up only on the first call; the answer is cached in
// function-local statics and later changes to the environment are ignored.
// Returns true when the variable is set and atoi() parses it to a non-zero
// value; returns false when it is unset or parses to 0.
bool bssn_escalar_cuda_post_rp_download_enabled()
{
  static bool resolved = false;
  static bool flag = false;
  if (resolved)
    return flag;

  const char *raw = getenv("AMSS_CUDA_ESCALAR_POST_RP_DOWNLOAD");
  flag = (raw != 0) && (atoi(raw) != 0);
  resolved = true;
  return flag;
}
// Per-level variant of bssn_escalar_cuda_post_rp_download_enabled().
//
// lev: refinement level index being queried.
//
// Returns false whenever the global switch is off. Otherwise the optional
// environment variable AMSS_CUDA_ESCALAR_POST_RP_MIN_LEVEL is consulted (once,
// on the first call that gets past the global switch; the result is cached):
//   - unset, or atoi() yields a negative number: every level is enabled;
//   - otherwise only levels with lev >= that value are enabled.
// Note that a non-numeric value parses to 0 through atoi(), which acts as a
// minimum level of 0.
bool bssn_escalar_cuda_post_rp_download_level_enabled(int lev)
{
  if (!bssn_escalar_cuda_post_rp_download_enabled())
    return false;

  static bool resolved = false;
  static int floor_level = -1; // -1 means "no lower bound configured"
  if (!resolved)
  {
    const char *raw = getenv("AMSS_CUDA_ESCALAR_POST_RP_MIN_LEVEL");
    if (raw)
    {
      const int parsed = atoi(raw);
      if (parsed >= 0)
        floor_level = parsed;
    }
    resolved = true;
  }

  if (floor_level < 0)
    return true;
  return lev >= floor_level;
}
// Tells whether the option controlled by the environment variable
// AMSS_CUDA_ESCALAR_POST_SWAP_RELEASE is switched on.
//
// The lookup happens once, on the first call, and is cached afterwards.
// Off by default: returns true only when the variable is set and atoi()
// parses it to a non-zero value.
bool bssn_escalar_cuda_post_swap_release_enabled()
{
  static bool resolved = false;
  static bool flag = false;
  if (!resolved)
  {
    const char *raw = getenv("AMSS_CUDA_ESCALAR_POST_SWAP_RELEASE");
    if (raw && atoi(raw) != 0)
      flag = true;
    resolved = true;
  }
  return flag;
}
// Tells whether the option controlled by the environment variable
// AMSS_CUDA_ESCALAR_PRE_RP_RELEASE is switched on.
//
// On by default: returns true when the variable is unset, or when it is set
// and atoi() parses it to a non-zero value. Returns false only when the
// variable is set and parses to 0 (which includes non-numeric text).
// The environment is read once on the first call and the result is cached.
bool bssn_escalar_cuda_pre_rp_release_enabled()
{
  static bool resolved = false;
  static bool flag = true;
  if (resolved)
    return flag;

  const char *raw = getenv("AMSS_CUDA_ESCALAR_PRE_RP_RELEASE");
  flag = (raw == 0) || (atoi(raw) != 0);
  resolved = true;
  return flag;
}
// Tells whether the option controlled by the environment variable
// AMSS_CUDA_BH_INTERP_RESIDENT is switched on.
//
// On by default: an unset variable yields true. When the variable is set,
// the result is true exactly when atoi() parses it to a non-zero value.
// The environment is read once on the first call and the result is cached.
bool bssn_escalar_cuda_bh_interp_resident_enabled()
{
  static bool resolved = false;
  static bool flag = true;
  if (!resolved)
  {
    const char *raw = getenv("AMSS_CUDA_BH_INTERP_RESIDENT");
    if (raw)
      flag = (atoi(raw) != 0);
    resolved = true;
  }
  return flag;
}
// Tells whether the option controlled by the environment variable
// AMSS_CUDA_ESCALAR_PRUNE_AFTER_SWAP is switched on.
//
// Off by default: returns true only when the variable is set and atoi()
// parses it to a non-zero value. The environment is read once on the first
// call; the result is cached for all later calls.
bool bssn_escalar_cuda_prune_after_swap_enabled()
{
  static bool resolved = false;
  static bool flag = false;
  if (resolved)
    return flag;

  const char *raw = getenv("AMSS_CUDA_ESCALAR_PRUNE_AFTER_SWAP");
  if (raw != 0)
    flag = (atoi(raw) != 0);
  resolved = true;
  return flag;
}
// Walks every patch in PatL and, for each block in the patch's [blb, ble]
// range that belongs to this rank and for which
// bssn_cuda_has_resident_state() is true, hands the block's state arrays to
// bssn_escalar_cuda_upload_resident_state().
//
// PatL:   head of the patch list to process (a null list is a no-op).
// vars:   variable list used by fill_bssn_escalar_cuda_views() to build the
//         per-block array of BSSN_ESCALAR_CUDA_STATE_COUNT data pointers.
// myrank: rank of the calling process; blocks with a different cg->rank are
//         left alone.
//
// Any failure (view construction returning false, or the upload helper
// returning non-zero) prints a message and calls MPI_Abort on MPI_COMM_WORLD.
// NOTE(review): presumably the helper copies host data into the device-side
// resident buffers -- confirm against its definition.
void bssn_escalar_cuda_upload_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                          int myrank)
{
  for (MyList<Patch> *patch = PatL; patch; patch = patch->next)
  {
    for (MyList<Block> *node = patch->data->blb; node; node = node->next)
    {
      Block *cg = node->data;
      const bool mine_and_resident =
          (myrank == cg->rank) && bssn_cuda_has_resident_state(cg);
      if (mine_and_resident)
      {
        double *views[BSSN_ESCALAR_CUDA_STATE_COUNT];
        const bool views_ok = fill_bssn_escalar_cuda_views(cg, vars, views);
        if (!views_ok)
        {
          cout << "CUDA BSSN-EScalar resident state list mismatch during upload" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_escalar_cuda_upload_resident_state(cg, cg->shape, views))
        {
          cout << "CUDA BSSN-EScalar resident state upload failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      // ble marks the last block of this patch; do not run past it into the
      // rest of the block list.
      if (node == patch->data->ble)
        break;
    }
  }
}
// Walks every patch in PatL and, for each block in the patch's [blb, ble]
// range that belongs to this rank and for which
// bssn_cuda_has_resident_state() is true, calls
// bssn_escalar_cuda_keep_only_resident_state() with the block's state
// pointer array as the key set.
//
// PatL:   head of the patch list to process (a null list is a no-op).
// vars:   variable list used by fill_bssn_escalar_cuda_views() to build the
//         per-block array of BSSN_ESCALAR_CUDA_STATE_COUNT data pointers.
// myrank: rank of the calling process; blocks with a different cg->rank are
//         left alone.
//
// Any failure (view construction returning false, or the prune helper
// returning non-zero) prints a message and calls MPI_Abort on MPI_COMM_WORLD.
// NOTE(review): presumably the helper drops resident buffers not associated
// with the listed pointers -- confirm against its definition.
void bssn_escalar_cuda_keep_only_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                             int myrank)
{
  for (MyList<Patch> *patch = PatL; patch; patch = patch->next)
  {
    for (MyList<Block> *node = patch->data->blb; node; node = node->next)
    {
      Block *cg = node->data;
      const bool mine_and_resident =
          (myrank == cg->rank) && bssn_cuda_has_resident_state(cg);
      if (mine_and_resident)
      {
        double *keys[BSSN_ESCALAR_CUDA_STATE_COUNT];
        const bool keys_ok = fill_bssn_escalar_cuda_views(cg, vars, keys);
        if (!keys_ok)
        {
          cout << "CUDA BSSN-EScalar resident state list mismatch during prune" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_escalar_cuda_keep_only_resident_state(cg, cg->shape, keys))
        {
          cout << "CUDA BSSN-EScalar resident state prune failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      // ble marks the last block of this patch; do not run past it into the
      // rest of the block list.
      if (node == patch->data->ble)
        break;
    }
  }
}
void bssn_escalar_timing_report(int myrank, int lev, int YN, double total, double rhs,
double sync, double bh, double analysis, double swap,
double resident, double rp)
@@ -1244,7 +1392,8 @@ void bssnEScalar_class::Step(int lev, int YN)
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN
(void)use_cuda_resident_sync;
if (use_cuda_resident_sync && !bssn_escalar_cuda_bh_interp_resident_enabled())
bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, false);
#endif
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
@@ -1670,7 +1819,8 @@ void bssnEScalar_class::Step(int lev, int YN)
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN
(void)use_cuda_resident_sync;
if (use_cuda_resident_sync && !bssn_escalar_cuda_bh_interp_resident_enabled())
bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
#endif
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
@@ -1760,7 +1910,8 @@ void bssnEScalar_class::Step(int lev, int YN)
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
if (!bssn_escalar_cuda_keep_resident_after_step(lev, trfls, a_lev))
bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank,
bssn_escalar_cuda_pre_rp_release_enabled());
if (escalar_step_timing)
escalar_t_resident += MPI_Wtime() - escalar_t0;
}
@@ -1833,9 +1984,28 @@ void bssnEScalar_class::Step(int lev, int YN)
sPp = sPp->next;
}
}
#endif
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
#endif
#if USE_CUDA_BSSN
bool release_after_sync = false;
if (use_cuda_resident_sync && bssn_escalar_cuda_post_rp_download_level_enabled(lev))
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
release_after_sync = bssn_escalar_cuda_post_swap_release_enabled();
bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, release_after_sync);
if (escalar_step_timing)
escalar_t_resident += MPI_Wtime() - escalar_t0;
}
if (use_cuda_resident_sync && !release_after_sync &&
bssn_escalar_cuda_prune_after_swap_enabled())
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
bssn_escalar_cuda_keep_only_level_state(GH->PatL[lev], StateList, myrank);
if (escalar_step_timing)
escalar_t_resident += MPI_Wtime() - escalar_t0;
}
#endif
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{