Save Z4C CUDA optimization progress

This commit is contained in:
2026-05-02 00:49:02 +08:00
parent 531b31e8db
commit 383e936e88
6 changed files with 343 additions and 66 deletions

View File

@@ -228,7 +228,13 @@ bool z4c_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP");
enabled = (env && atoi(env) != 0) ? 1 : 0;
if (env)
enabled = (atoi(env) != 0) ? 1 : 0;
else
{
env = getenv("AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
}
if (!enabled)
return false;
@@ -478,6 +484,89 @@ bool z4c_cuda_compute_porg_rhs_resident(cgh *GH,
return true;
}
// Walks every block of every patch in PatL and, for each block that is owned
// by this rank and currently has resident device state, downloads a
// three-field state subset (selected by k_z4c_cuda_bh_state_indices) into the
// host grid functions addressed by forx, fory and forz.
//
// Parameters:
//   PatL             head of the patch list to traverse (may be null).
//   myrank           rank of the calling process; blocks with another rank are skipped.
//   forx/fory/forz   variables whose sgfn selects the destination host arrays.
//
// Returns true when every attempted download succeeded (or nothing had to be
// downloaded); returns false as soon as one download reports a nonzero result.
bool z4c_cuda_download_bh_shift_level(MyList<Patch> *PatL,
                                      int myrank,
                                      var *forx, var *fory, var *forz)
{
  for (MyList<Patch> *patch_node = PatL; patch_node; patch_node = patch_node->next)
  {
    for (MyList<Block> *block_node = patch_node->data->blb;
         block_node;
         block_node = block_node->next)
    {
      Block *blk = block_node->data;

      // Only blocks owned by this rank that still hold device-resident state
      // have anything to copy back.
      const bool owned_and_resident =
          (myrank == blk->rank) && z4c_cuda_has_resident_state(blk);
      if (owned_and_resident)
      {
        double *host_targets[3] = {
            blk->fgfs[forx->sgfn],
            blk->fgfs[fory->sgfn],
            blk->fgfs[forz->sgfn]};

        // A nonzero result from the subset download is treated as failure.
        if (z4c_cuda_download_state_subset(blk, blk->shape, 3,
                                           k_z4c_cuda_bh_state_indices,
                                           host_targets))
        {
          return false;
        }
      }

      // The patch's block range ends at 'ble'; do not walk into the blocks
      // that follow it in the list.
      if (block_node == patch_node->data->ble)
        break;
    }
  }
  return true;
}
// For every block in PatL owned by this rank: if the block has resident
// device state, recompute the seven constraint fields from that state and
// download state index 24 into the host array of TZ0; otherwise remember that
// at least one owned block was not resident.
//
// Parameters:
//   PatL                 head of the patch list to traverse (may be null).
//   myrank               rank of the calling process; other ranks' blocks are skipped.
//   Cons_Ham .. Cons_Gz  variables whose sgfn selects the seven constraint output arrays.
//   TZ0                  variable whose sgfn selects the host array receiving state index 24.
//   Symmetry, eps        forwarded unchanged to the resident constraint computation.
//   lev                  not used by this function.
//
// Returns true only if every block owned by this rank had resident state.
// If the constraint computation or the download reports a nonzero result, a
// message is printed and MPI_Abort is called.
bool z4c_cuda_refresh_constraint_level(MyList<Patch> *PatL,
                                       int myrank,
                                       var *Cons_Ham, var *Cons_Px,
                                       var *Cons_Py, var *Cons_Pz,
                                       var *Cons_Gx, var *Cons_Gy,
                                       var *Cons_Gz, var *TZ0,
                                       int Symmetry, int lev, double eps)
{
  // State index downloaded into TZ0's host array.
  // NOTE(review): presumably the TZ slot of the resident state layout - confirm.
  const int tz_index = 24;
  bool all_resident = true;

  for (MyList<Patch> *patch_node = PatL; patch_node; patch_node = patch_node->next)
  {
    for (MyList<Block> *block_node = patch_node->data->blb;
         block_node;
         block_node = block_node->next)
    {
      Block *blk = block_node->data;

      if (myrank == blk->rank)
      {
        if (z4c_cuda_has_resident_state(blk))
        {
          double *cons_out[7] = {
              blk->fgfs[Cons_Ham->sgfn], blk->fgfs[Cons_Px->sgfn],
              blk->fgfs[Cons_Py->sgfn], blk->fgfs[Cons_Pz->sgfn],
              blk->fgfs[Cons_Gx->sgfn], blk->fgfs[Cons_Gy->sgfn],
              blk->fgfs[Cons_Gz->sgfn]};
          double *tz_host[1] = {blk->fgfs[TZ0->sgfn]};
          int co = 0;

          // Run the constraint computation first; the TZ download is only
          // attempted when the computation did not report a failure.
          bool failed = z4c_cuda_compute_constraints_resident(blk, blk->shape,
                                                              blk->X[0], blk->X[1], blk->X[2],
                                                              Symmetry, eps, co,
                                                              cons_out);
          if (!failed)
            failed = z4c_cuda_download_state_subset(blk, blk->shape, 1, &tz_index, tz_host);

          if (failed)
          {
            cout << "CUDA Z4C resident constraint refresh failed" << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
        }
        else
        {
          // An owned block without device state: report this to the caller.
          all_resident = false;
        }
      }

      // The patch's block range ends at 'ble'; stop before leaving it.
      if (block_node == patch_node->data->ble)
        break;
    }
  }
  return all_resident;
}
} // namespace
#endif
@@ -496,6 +585,33 @@ void Z4c_class::Step(int lev, int YN)
int iter_count = 0;
int pre = 0, cor = 1;
int ERROR = 0;
const double dT_mon = dT * pow(0.5, Mymax(0, trfls));
const bool need_constraint_after_step = (LastConsOut + dT_mon >= AnasTime);
if (BH_num > 0 && lev == GH->levels - 1)
{
if (!z4c_cuda_download_bh_shift_level(GH->PatL[lev], myrank, Sfx0, Sfy0, Sfz0))
{
if (myrank == 0 && ErrorMonitor->outfile)
ErrorMonitor->outfile << "CUDA Z4C failed to download predictor black-hole shift at t = "
<< PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
if (Symmetry > 0)
Porg[ithBH][2] = fabs(Porg[ithBH][2]);
if (Symmetry == 2)
{
Porg[ithBH][0] = fabs(Porg[ithBH][0]);
Porg[ithBH][1] = fabs(Porg[ithBH][1]);
}
}
}
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
@@ -565,24 +681,6 @@ void Z4c_class::Step(int lev, int YN)
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
if (Symmetry > 0)
Porg[ithBH][2] = fabs(Porg[ithBH][2]);
if (Symmetry == 2)
{
Porg[ithBH][0] = fabs(Porg[ithBH][0]);
Porg[ithBH][1] = fabs(Porg[ithBH][1]);
}
}
}
if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime))
z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
if (lev == a_lev)
@@ -640,6 +738,25 @@ void Z4c_class::Step(int lev, int YN)
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
if (!ERROR && iter_count == 3 && need_constraint_after_step)
{
double *constraints[7] = {
cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn],
cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn],
cg->fgfs[Cons_Gz->sgfn]};
double *tz_out[1] = {cg->fgfs[TZ0->sgfn]};
const int tz_index = 24;
if (z4c_cuda_download_constraint_outputs(cg->shape, constraints) ||
z4c_cuda_download_state_subset(cg, cg->shape, 1, &tz_index, tz_out))
{
cout << "CUDA Z4C constraint download failed in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
}
}
if (BP == Pp->data->ble)
break;
@@ -719,7 +836,10 @@ void Z4c_class::Step(int lev, int YN)
{
const bool keep_resident = z4c_cuda_keep_resident_after_step(lev, trfls, a_lev);
z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, !keep_resident);
const bool need_host_after_step =
((lev == a_lev) && (LastAnas + dT_lev >= AnasTime));
if (!keep_resident || need_host_after_step)
z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, !keep_resident);
}
#if (RPS == 0)
@@ -2991,17 +3111,23 @@ void Z4c_class::Check_extrop()
//================================================================================================
void Z4c_class::Constraint_Out()
{
// here we have to use the same variable name as in the parent class
LastConsOut += dT * pow(0.5, Mymax(0, trfls));
if (LastConsOut >= AnasTime)
// Constraint violation
{
// recompute least the constraint data lost for moved new grid
for (int lev = 0; lev < GH->levels; lev++)
{
void Z4c_class::Constraint_Out()
{
// here we have to use the same variable name as in the parent class
LastConsOut += dT * pow(0.5, Mymax(0, trfls));
if (LastConsOut >= AnasTime)
// Constraint violation
{
#if USE_CUDA_Z4C && (ABEtype == 2)
bool cuda_constraints_ready = true;
#else
const bool cuda_constraints_ready = false;
#endif
// recompute least the constraint data lost for moved new grid
if (!cuda_constraints_ready)
for (int lev = 0; lev < GH->levels; lev++)
{
// make sure the data consistent for higher levels
if (lev > 0)
{