Save Z4C CUDA optimization progress

This commit is contained in:
2026-05-02 00:49:02 +08:00
parent 531b31e8db
commit 383e936e88
6 changed files with 343 additions and 66 deletions

View File

@@ -28,6 +28,9 @@ using namespace std;
#if USE_CUDA_BSSN
#include "bssn_rhs_cuda.h"
#endif
#if USE_CUDA_Z4C && (ABEtype == 2)
#include "z4c_rhs_cuda.h"
#endif
#include "initial_puncture.h"
#include "enforce_algebra.h"
#include "rungekutta4_rout.h"
@@ -36,6 +39,12 @@ using namespace std;
#include "shellfunctions.h"
#include "parameters.h"
// AMSS_LEGACY_ABE_TRANSFER selects which inter-grid transfer calls are compiled
// in the code guarded by it:
//   1 -> ABEtype == 1, or ABEtype == 2 built without USE_CUDA_Z4C; the guarded
//        sites call the plain Parallel::Sync / Parallel::Restrict routines.
//   0 -> every other configuration (including ABEtype == 2 with USE_CUDA_Z4C);
//        the guarded sites use the Sync_start/Sync_finish, Sync_cached and
//        Restrict_cached variants instead.
// The macro is always defined (to 0 or 1), so it must be tested with #if,
// not #ifdef.
#if (ABEtype == 1) || ((ABEtype == 2) && !USE_CUDA_Z4C)
#define AMSS_LEGACY_ABE_TRANSFER 1
#else
#define AMSS_LEGACY_ABE_TRANSFER 0
#endif
#ifdef With_AHF
#include "derivatives.h"
#include "myglobal.h"
@@ -647,6 +656,87 @@ void bssn_cuda_flush_level_before_regrid(MyList<Patch> *PatL,
bssn_cuda_release_level_state(PatL, myrank);
}
#if USE_CUDA_Z4C && (ABEtype == 2)
// Gathers the host grid-function pointers of one block for the variables in
// `vars`, storing them into `host_views` in list order.
//
//   cg         - block whose fgfs[] table is indexed by each variable's sgfn.
//   vars       - head of the variable list to walk (may be null).
//   host_views - output array; up to Z4C_CUDA_STATE_COUNT slots are written.
//
// Returns true only when the list contains exactly Z4C_CUDA_STATE_COUNT
// entries: too few leaves the counter short, too many leaves `vars` non-null
// once the output array is full.
bool fill_z4c_cuda_views_for_regrid(Block *cg, MyList<var> *vars,
                                    double **host_views)
{
  int filled = 0;
  for (; vars && filled < Z4C_CUDA_STATE_COUNT; vars = vars->next)
  {
    host_views[filled] = cg->fgfs[vars->data->sgfn];
    ++filled;
  }

  if (filled != Z4C_CUDA_STATE_COUNT)
    return false; // list ended early
  return vars == 0; // false when entries remain beyond the expected count
}
// Walks every block of every patch in `PatL` and, for blocks that belong to
// this rank and for which z4c_cuda_has_resident_state() reports true, calls
// z4c_cuda_download_resident_state() with the host arrays addressed by `vars`.
//
//   PatL   - head of the patch list for one level (may be null).
//   vars   - variable list naming the destination host arrays; it must hold
//            exactly Z4C_CUDA_STATE_COUNT entries.
//   myrank - rank of the calling process; blocks owned by other ranks are
//            skipped.
//
// A list-length mismatch or a non-zero result from the download call prints a
// message and calls MPI_Abort on MPI_COMM_WORLD.
void z4c_cuda_download_level_state_if_present_for_regrid(MyList<Patch> *PatL,
                                                         MyList<var> *vars,
                                                         int myrank)
{
  for (MyList<Patch> *patch = PatL; patch; patch = patch->next)
  {
    for (MyList<Block> *node = patch->data->blb; node; node = node->next)
    {
      Block *blk = node->data;
      const bool owned_and_resident =
          (myrank == blk->rank) && z4c_cuda_has_resident_state(blk);

      if (owned_and_resident)
      {
        double *host_dst[Z4C_CUDA_STATE_COUNT];
        const bool list_ok = fill_z4c_cuda_views_for_regrid(blk, vars, host_dst);
        if (!list_ok)
        {
          cout << "CUDA Z4C state list mismatch on regrid flush" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (z4c_cuda_download_resident_state(blk, blk->shape, host_dst))
        {
          cout << "CUDA Z4C resident state regrid download failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }

      // ble marks the last block of this patch; do not step past it.
      if (node == patch->data->ble)
        break;
    }
  }
}
// Calls z4c_cuda_release_step_ctx() on every block in `PatL` that belongs to
// this rank and for which z4c_cuda_has_resident_state() reports true.
//
//   PatL   - head of the patch list for one level (may be null).
//   myrank - rank of the calling process; blocks owned by other ranks are
//            left untouched.
void z4c_cuda_release_level_state_for_regrid(MyList<Patch> *PatL, int myrank)
{
  for (MyList<Patch> *patch = PatL; patch; patch = patch->next)
  {
    for (MyList<Block> *node = patch->data->blb; node; node = node->next)
    {
      Block *blk = node->data;
      if (myrank == blk->rank)
      {
        if (z4c_cuda_has_resident_state(blk))
          z4c_cuda_release_step_ctx(blk);
      }

      // ble marks the last block of this patch; do not step past it.
      if (node == patch->data->ble)
        break;
    }
  }
}
// Flushes one level ahead of a regrid: runs the resident-state download once
// per variable list, in the order corL, oldL, stateL, preL, and then releases
// the level's resident state.
//
//   PatL                     - head of the patch list for the level.
//   corL, oldL, stateL, preL - variable lists used as download destinations.
//   myrank                   - rank of the calling process.
//
// NOTE(review): every download call is handed the same blocks, so the same
// resident state appears to be written into all four lists - confirm this is
// the intended behaviour of z4c_cuda_download_resident_state().
void z4c_cuda_flush_level_before_regrid(MyList<Patch> *PatL,
                                        MyList<var> *corL,
                                        MyList<var> *oldL,
                                        MyList<var> *stateL,
                                        MyList<var> *preL,
                                        int myrank)
{
  MyList<var> *const targets[] = {corL, oldL, stateL, preL};
  const int target_count = static_cast<int>(sizeof(targets) / sizeof(targets[0]));

  for (int t = 0; t < target_count; ++t)
    z4c_cuda_download_level_state_if_present_for_regrid(PatL, targets[t], myrank);

  z4c_cuda_release_level_state_for_regrid(PatL, myrank);
}
#endif
bool bssn_cuda_regrid_flush_enabled()
{
static int enabled = -1;
@@ -2969,6 +3059,10 @@ void bssn_class::Evolve(int Steps)
STEP_TIMER_DECL(timer_dump3d);
// misc::tillherecheck("before Dump_Data");
#if USE_CUDA_Z4C && (ABEtype == 2)
for (int lev = 0; lev < GH->levels; lev++)
z4c_cuda_download_level_state_if_present_for_regrid(GH->PatL[lev], StateList, myrank);
#endif
for (int lev = 0; lev < GH->levels; lev++)
Parallel::Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon);
#ifdef WithShell
@@ -2990,6 +3084,10 @@ void bssn_class::Evolve(int Steps)
STEP_TIMER_DECL(timer_dump2d);
// misc::tillherecheck("before 2dDump_Data");
#if USE_CUDA_Z4C && (ABEtype == 2)
for (int lev = 0; lev < GH->levels; lev++)
z4c_cuda_download_level_state_if_present_for_regrid(GH->PatL[lev], StateList, myrank);
#endif
for (int lev = 0; lev < GH->levels; lev++)
Parallel::d2Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon);
STEP_TIMER_ADD(TB_DUMP_2D, timer_dump2d);
@@ -3018,13 +3116,21 @@ void bssn_class::Evolve(int Steps)
#if (REGLEV == 1)
STEP_TIMER_DECL(timer_regrid);
#if USE_CUDA_BSSN
#if USE_CUDA_BSSN && (ABEtype != 2)
for (int il = 0; il < GH->levels; il++)
if (bssn_cuda_should_flush_before_regrid(GH, il, Symmetry, BH_num, Porg0))
bssn_cuda_flush_level_before_regrid(GH->PatL[il],
SynchList_cor, OldStateList,
StateList, SynchList_pre,
myrank);
#endif
#if USE_CUDA_Z4C && USE_CUDA_BSSN && (ABEtype == 2)
for (int il = 0; il < GH->levels; il++)
if (bssn_cuda_should_flush_before_regrid(GH, il, Symmetry, BH_num, Porg0))
z4c_cuda_flush_level_before_regrid(GH->PatL[il],
SynchList_cor, OldStateList,
StateList, SynchList_pre,
myrank);
#endif
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
@@ -3113,6 +3219,10 @@ void bssn_class::Evolve(int Steps)
STEP_TIMER_DECL(timer_checkpoint);
LastCheck = 0;
#if USE_CUDA_Z4C && (ABEtype == 2)
for (int lev = 0; lev < GH->levels; lev++)
z4c_cuda_download_level_state_if_present_for_regrid(GH->PatL[lev], StateList, myrank);
#endif
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
CheckPoint->writecheck_cgh(PhysTime, GH);
#ifdef WithShell
@@ -4346,7 +4456,7 @@ void bssn_class::Step(int lev, int YN)
STEP_TIMER_DECL(timer_predictor_sync);
Parallel::AsyncSyncState async_pre;
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
#else
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
@@ -4369,7 +4479,7 @@ void bssn_class::Step(int lev, int YN)
}
}
#endif
#if (ABEtype != 1 && ABEtype != 2)
#if !AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
#endif
STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync);
@@ -4793,7 +4903,7 @@ void bssn_class::Step(int lev, int YN)
STEP_TIMER_DECL(timer_corrector_sync);
Parallel::AsyncSyncState async_cor;
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
#else
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
@@ -4816,7 +4926,7 @@ void bssn_class::Step(int lev, int YN)
}
}
#endif
#if (ABEtype != 1 && ABEtype != 2)
#if !AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
#endif
STEP_TIMER_ADD(TB_CORRECTOR_SYNC, timer_corrector_sync);
@@ -5312,7 +5422,7 @@ void bssn_class::Step(int lev, int YN)
#endif
Parallel::AsyncSyncState async_pre;
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
#else
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
@@ -5335,7 +5445,7 @@ void bssn_class::Step(int lev, int YN)
}
}
#endif
#if (ABEtype != 1 && ABEtype != 2)
#if !AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
#endif
@@ -5663,7 +5773,7 @@ void bssn_class::Step(int lev, int YN)
#endif
Parallel::AsyncSyncState async_cor;
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
#else
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
@@ -5686,7 +5796,7 @@ void bssn_class::Step(int lev, int YN)
}
}
#endif
#if (ABEtype != 1 && ABEtype != 2)
#if !AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
#endif
@@ -6080,7 +6190,7 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
#else
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
@@ -6285,7 +6395,7 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
#else
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
@@ -6926,7 +7036,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
@@ -6943,7 +7053,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -6960,7 +7070,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#if (RPB == 0)
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
@@ -7001,7 +7111,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
@@ -7018,7 +7128,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -7035,7 +7145,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#if (RPB == 0)
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
@@ -7066,7 +7176,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
}
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev], SL, Symmetry);
#else
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
@@ -7124,7 +7234,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
}
#if (RPB == 0)
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
@@ -7134,7 +7244,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
#endif
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -7144,7 +7254,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
#if (RPB == 0)
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
@@ -7170,7 +7280,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
else // no time refinement levels and for all same time levels
{
#if (RPB == 0)
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
@@ -7180,7 +7290,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
#endif
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -7190,7 +7300,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
#if (RPB == 0)
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
@@ -7214,7 +7324,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
#endif
}
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev], SL, Symmetry);
#else
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
@@ -7265,7 +7375,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
}
#if (RPB == 0)
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
@@ -7275,7 +7385,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
#endif
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -7285,7 +7395,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0)
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
@@ -7313,7 +7423,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
if (myrank == 0)
cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
#if (RPB == 0)
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
@@ -7323,7 +7433,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
#endif
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -7333,7 +7443,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0)
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
@@ -7357,7 +7467,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#endif
}
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
#else
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
@@ -9065,7 +9175,7 @@ void bssn_class::AH_Step_Find(int lev, double dT_lev)
ihn++;
}
#if (ABEtype == 1 || ABEtype == 2)
#if AMSS_LEGACY_ABE_TRANSFER
if (PhysTime > 10)
{
ihn--;