From 383e936e8893d0cd4c742335ff07f4916ead3522 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Sat, 2 May 2026 00:49:02 +0800 Subject: [PATCH] Save Z4C CUDA optimization progress --- AMSS_NCKU_source/Parallel.C | 2 +- AMSS_NCKU_source/Z4c_class.C | 188 ++++++++++++++++++++++++++----- AMSS_NCKU_source/bssn_class.C | 176 +++++++++++++++++++++++------ AMSS_NCKU_source/z4c_rhs_cuda.cu | 31 ++++- AMSS_NCKU_source/z4c_rhs_cuda.h | 8 ++ makefile_and_run.py | 4 + 6 files changed, 343 insertions(+), 66 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 7dc3a34..a0a7434 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -335,7 +335,7 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg if (z4c_amr_device < 0) { const char *env = getenv("AMSS_CUDA_Z4C_AMR_DEVICE"); - z4c_amr_device = (env && atoi(env) != 0) ? 1 : 0; + z4c_amr_device = (!env || atoi(env) != 0) ? 1 : 0; } if (!z4c_amr_device) return false; diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C index b75ed73..7585cc2 100644 --- a/AMSS_NCKU_source/Z4c_class.C +++ b/AMSS_NCKU_source/Z4c_class.C @@ -228,7 +228,13 @@ bool z4c_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev) if (enabled < 0) { const char *env = getenv("AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP"); - enabled = (env && atoi(env) != 0) ? 1 : 0; + if (env) + enabled = (atoi(env) != 0) ? 1 : 0; + else + { + env = getenv("AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } } if (!enabled) return false; @@ -478,6 +484,89 @@ bool z4c_cuda_compute_porg_rhs_resident(cgh *GH, return true; } +bool z4c_cuda_download_bh_shift_level(MyList *PatL, + int myrank, + var *forx, var *fory, var *forz) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && z4c_cuda_has_resident_state(cg)) + { + double *fields[3] = { + cg->fgfs[forx->sgfn], + cg->fgfs[fory->sgfn], + cg->fgfs[forz->sgfn]}; + if (z4c_cuda_download_state_subset(cg, cg->shape, 3, + k_z4c_cuda_bh_state_indices, + fields)) + return false; + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + return true; +} + +bool z4c_cuda_refresh_constraint_level(MyList *PatL, + int myrank, + var *Cons_Ham, var *Cons_Px, + var *Cons_Py, var *Cons_Pz, + var *Cons_Gx, var *Cons_Gy, + var *Cons_Gz, var *TZ0, + int Symmetry, int lev, double eps) +{ + bool all_resident = true; + const int tz_index = 24; + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { + if (!z4c_cuda_has_resident_state(cg)) + { + all_resident = false; + } + else + { + double *constraints[7] = { + cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn], + cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], + cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], + cg->fgfs[Cons_Gz->sgfn]}; + double *tz_out[1] = {cg->fgfs[TZ0->sgfn]}; + int co = 0; + if (z4c_cuda_compute_constraints_resident(cg, cg->shape, + cg->X[0], cg->X[1], cg->X[2], + Symmetry, eps, co, + constraints) || + z4c_cuda_download_state_subset(cg, cg->shape, 1, &tz_index, tz_out)) + { + cout << "CUDA Z4C resident constraint refresh failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + return all_resident; +} + } // namespace #endif @@ -496,6 +585,33 @@ void Z4c_class::Step(int lev, int YN) int iter_count = 0; int pre = 0, cor = 1; int ERROR = 0; + const double dT_mon = dT * pow(0.5, Mymax(0, trfls)); + const bool need_constraint_after_step = (LastConsOut + dT_mon >= AnasTime); + + if (BH_num > 0 && lev == GH->levels - 1) + { + if (!z4c_cuda_download_bh_shift_level(GH->PatL[lev], myrank, Sfx0, Sfy0, Sfz0)) + { + if (myrank == 0 && ErrorMonitor->outfile) + ErrorMonitor->outfile << "CUDA Z4C failed to download predictor black-hole shift at t = " + << PhysTime << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count); + if (Symmetry > 0) + Porg[ithBH][2] = fabs(Porg[ithBH][2]); + if (Symmetry == 2) + { + Porg[ithBH][0] = fabs(Porg[ithBH][0]); + Porg[ithBH][1] = fabs(Porg[ithBH][1]); + } + } + } MyList *Pp = GH->PatL[lev]; while (Pp) @@ -565,24 +681,6 @@ void Z4c_class::Step(int lev, int YN) Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); - if (BH_num > 0 && lev == GH->levels - 1) - { - compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); - for (int ithBH = 0; ithBH < BH_num; ithBH++) - { - f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count); - f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count); - f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count); - if (Symmetry > 0) - Porg[ithBH][2] = fabs(Porg[ithBH][2]); - if (Symmetry == 2) - { - Porg[ithBH][0] = fabs(Porg[ithBH][0]); - Porg[ithBH][1] = fabs(Porg[ithBH][1]); - } - } - } - if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime)) z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false); if (lev == a_lev) @@ -640,6 +738,25 @@ void Z4c_class::Step(int lev, int YN) << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; ERROR = 1; } + if (!ERROR && iter_count == 3 && need_constraint_after_step) + { + double *constraints[7] = { + cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn], + cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], + cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], + cg->fgfs[Cons_Gz->sgfn]}; + double *tz_out[1] = {cg->fgfs[TZ0->sgfn]}; + const int tz_index = 24; + if (z4c_cuda_download_constraint_outputs(cg->shape, constraints) || + z4c_cuda_download_state_subset(cg, cg->shape, 1, &tz_index, tz_out)) + { + cout << "CUDA Z4C constraint download failed in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + } } if (BP == Pp->data->ble) break; @@ -719,7 +836,10 @@ void Z4c_class::Step(int lev, int YN) { const bool keep_resident = z4c_cuda_keep_resident_after_step(lev, trfls, a_lev); - z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, !keep_resident); + const bool need_host_after_step = + ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime)); + if (!keep_resident || need_host_after_step) + z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, !keep_resident); } #if (RPS == 0) @@ -2991,17 +3111,23 @@ void Z4c_class::Check_extrop() //================================================================================================ -void Z4c_class::Constraint_Out() -{ - // here we have to use the same variable name as in the parent class - LastConsOut += dT * pow(0.5, Mymax(0, trfls)); - - if (LastConsOut >= AnasTime) - // Constraint violation - { - // recompute least the constraint data lost for moved new grid - for (int lev = 0; lev < GH->levels; lev++) - { +void Z4c_class::Constraint_Out() +{ + // here we have to use the same variable name as in the parent class + LastConsOut += dT * pow(0.5, Mymax(0, trfls)); + + if (LastConsOut >= AnasTime) + // Constraint violation + { +#if USE_CUDA_Z4C && (ABEtype == 2) + bool cuda_constraints_ready = true; +#else + const bool cuda_constraints_ready = false; +#endif + // recompute least the constraint data lost for moved new grid + if (!cuda_constraints_ready) + for (int lev = 0; lev < GH->levels; lev++) + { // make sure the data consistent for higher levels if (lev > 0) { diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index b6aaf67..c4164b3 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -28,6 +28,9 @@ using namespace std; #if USE_CUDA_BSSN #include "bssn_rhs_cuda.h" #endif +#if USE_CUDA_Z4C && (ABEtype == 2) +#include "z4c_rhs_cuda.h" +#endif #include "initial_puncture.h" #include "enforce_algebra.h" #include "rungekutta4_rout.h" @@ -36,6 +39,12 @@ using namespace std; #include "shellfunctions.h" #include "parameters.h" +#if (ABEtype == 1) || ((ABEtype == 2) && !USE_CUDA_Z4C) +#define AMSS_LEGACY_ABE_TRANSFER 1 +#else +#define AMSS_LEGACY_ABE_TRANSFER 0 +#endif + #ifdef With_AHF #include "derivatives.h" #include "myglobal.h" @@ -647,6 +656,87 @@ void bssn_cuda_flush_level_before_regrid(MyList *PatL, bssn_cuda_release_level_state(PatL, myrank); } +#if USE_CUDA_Z4C && (ABEtype == 2) +bool fill_z4c_cuda_views_for_regrid(Block *cg, MyList *vars, + double **host_views) +{ + int idx = 0; + while (vars && idx < Z4C_CUDA_STATE_COUNT) + { + host_views[idx] = cg->fgfs[vars->data->sgfn]; + vars = vars->next; + ++idx; + } + return idx == Z4C_CUDA_STATE_COUNT && vars == 0; +} + +void z4c_cuda_download_level_state_if_present_for_regrid(MyList *PatL, + MyList *vars, + int myrank) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && z4c_cuda_has_resident_state(cg)) + { + double *state_out[Z4C_CUDA_STATE_COUNT]; + if (!fill_z4c_cuda_views_for_regrid(cg, vars, state_out)) + { + cout << "CUDA Z4C state list mismatch on regrid flush" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (z4c_cuda_download_resident_state(cg, cg->shape, state_out)) + { + cout << "CUDA Z4C resident state regrid download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} + +void z4c_cuda_release_level_state_for_regrid(MyList *PatL, int myrank) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && z4c_cuda_has_resident_state(cg)) + z4c_cuda_release_step_ctx(cg); + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} + +void z4c_cuda_flush_level_before_regrid(MyList *PatL, + MyList *corL, + MyList *oldL, + MyList *stateL, + MyList *preL, + int myrank) +{ + z4c_cuda_download_level_state_if_present_for_regrid(PatL, corL, myrank); + z4c_cuda_download_level_state_if_present_for_regrid(PatL, oldL, myrank); + z4c_cuda_download_level_state_if_present_for_regrid(PatL, stateL, myrank); + z4c_cuda_download_level_state_if_present_for_regrid(PatL, preL, myrank); + z4c_cuda_release_level_state_for_regrid(PatL, myrank); +} +#endif + bool bssn_cuda_regrid_flush_enabled() { static int enabled = -1; @@ -2969,6 +3059,10 @@ void bssn_class::Evolve(int Steps) STEP_TIMER_DECL(timer_dump3d); // misc::tillherecheck("before Dump_Data"); +#if USE_CUDA_Z4C && (ABEtype == 2) + for (int lev = 0; lev < GH->levels; lev++) + z4c_cuda_download_level_state_if_present_for_regrid(GH->PatL[lev], StateList, myrank); +#endif for (int lev = 0; lev < GH->levels; lev++) Parallel::Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon); #ifdef WithShell @@ -2990,6 +3084,10 @@ void bssn_class::Evolve(int Steps) STEP_TIMER_DECL(timer_dump2d); // misc::tillherecheck("before 2dDump_Data"); +#if USE_CUDA_Z4C && (ABEtype == 2) + for (int lev = 0; lev < GH->levels; lev++) + z4c_cuda_download_level_state_if_present_for_regrid(GH->PatL[lev], StateList, myrank); +#endif for (int lev = 0; lev < GH->levels; lev++) Parallel::d2Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon); STEP_TIMER_ADD(TB_DUMP_2D, timer_dump2d); @@ -3018,13 +3116,21 @@ void bssn_class::Evolve(int Steps) #if (REGLEV == 1) STEP_TIMER_DECL(timer_regrid); -#if USE_CUDA_BSSN +#if USE_CUDA_BSSN && (ABEtype != 2) for (int il = 0; il < GH->levels; il++) if (bssn_cuda_should_flush_before_regrid(GH, il, Symmetry, BH_num, Porg0)) bssn_cuda_flush_level_before_regrid(GH->PatL[il], SynchList_cor, OldStateList, StateList, SynchList_pre, myrank); +#endif +#if USE_CUDA_Z4C && USE_CUDA_BSSN && (ABEtype == 2) + for (int il = 0; il < GH->levels; il++) + if (bssn_cuda_should_flush_before_regrid(GH, il, Symmetry, BH_num, Porg0)) + z4c_cuda_flush_level_before_regrid(GH->PatL[il], + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); #endif GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, @@ -3113,6 +3219,10 @@ void bssn_class::Evolve(int Steps) STEP_TIMER_DECL(timer_checkpoint); LastCheck = 0; +#if USE_CUDA_Z4C && (ABEtype == 2) + for (int lev = 0; lev < GH->levels; lev++) + z4c_cuda_download_level_state_if_present_for_regrid(GH->PatL[lev], StateList, myrank); +#endif CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass); CheckPoint->writecheck_cgh(PhysTime, GH); #ifdef WithShell @@ -4346,7 +4456,7 @@ void bssn_class::Step(int lev, int YN) STEP_TIMER_DECL(timer_predictor_sync); Parallel::AsyncSyncState async_pre; -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); #else Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); @@ -4369,7 +4479,7 @@ void bssn_class::Step(int lev, int YN) } } #endif -#if (ABEtype != 1 && ABEtype != 2) +#if !AMSS_LEGACY_ABE_TRANSFER Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); #endif STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync); @@ -4793,7 +4903,7 @@ void bssn_class::Step(int lev, int YN) STEP_TIMER_DECL(timer_corrector_sync); Parallel::AsyncSyncState async_cor; -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); #else Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); @@ -4816,7 +4926,7 @@ void bssn_class::Step(int lev, int YN) } } #endif -#if (ABEtype != 1 && ABEtype != 2) +#if !AMSS_LEGACY_ABE_TRANSFER Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); #endif STEP_TIMER_ADD(TB_CORRECTOR_SYNC, timer_corrector_sync); @@ -5312,7 +5422,7 @@ void bssn_class::Step(int lev, int YN) #endif Parallel::AsyncSyncState async_pre; -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); #else Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); @@ -5335,7 +5445,7 @@ void bssn_class::Step(int lev, int YN) } } #endif -#if (ABEtype != 1 && ABEtype != 2) +#if !AMSS_LEGACY_ABE_TRANSFER Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); #endif @@ -5663,7 +5773,7 @@ void bssn_class::Step(int lev, int YN) #endif Parallel::AsyncSyncState async_cor; -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); #else Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); @@ -5686,7 +5796,7 @@ void bssn_class::Step(int lev, int YN) } } #endif -#if (ABEtype != 1 && ABEtype != 2) +#if !AMSS_LEGACY_ABE_TRANSFER Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); #endif @@ -6080,7 +6190,7 @@ void bssn_class::Step(int lev, int YN) // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync"); -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); #else Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); @@ -6285,7 +6395,7 @@ void bssn_class::Step(int lev, int YN) // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync"); -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); #else Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]); @@ -6926,7 +7036,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry); #else Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]); @@ -6943,7 +7053,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); #endif -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); #else #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) @@ -6960,7 +7070,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Ppc = GH->PatL[lev - 1]; while (Ppc) { @@ -7001,7 +7111,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #else Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]); @@ -7018,7 +7128,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); #endif -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry); #else #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) @@ -7035,7 +7145,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Ppc = GH->PatL[lev - 1]; while (Ppc) { @@ -7066,7 +7176,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif } -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev], SL, Symmetry); #else Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); @@ -7124,7 +7234,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, } #if (RPB == 0) -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry); #else Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]); @@ -7134,7 +7244,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); #endif -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); #else #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) @@ -7144,7 +7254,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Ppc = GH->PatL[lev - 1]; while (Ppc) { @@ -7170,7 +7280,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, else // no time refinement levels and for all same time levels { #if (RPB == 0) -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #else Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]); @@ -7180,7 +7290,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); #endif -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry); #else #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) @@ -7190,7 +7300,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Ppc = GH->PatL[lev - 1]; while (Ppc) { @@ -7214,7 +7324,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, #endif } -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev], SL, Symmetry); #else Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); @@ -7265,7 +7375,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) } #if (RPB == 0) -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry); #else Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]); @@ -7275,7 +7385,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry); #endif -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); #else #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) @@ -7285,7 +7395,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #if (RPB == 0) #if (MIXOUTB == 0) -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Ppc = GH->PatL[lev - 1]; while (Ppc) { @@ -7313,7 +7423,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) if (myrank == 0) cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl; #if (RPB == 0) -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); #else Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]); @@ -7323,7 +7433,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry); #endif -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry); #else #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) @@ -7333,7 +7443,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #if (RPB == 0) #if (MIXOUTB == 0) -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Ppc = GH->PatL[lev - 1]; while (Ppc) { @@ -7357,7 +7467,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #endif } -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); #else Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); @@ -9065,7 +9175,7 @@ void bssn_class::AH_Step_Find(int lev, double dT_lev) ihn++; } -#if (ABEtype == 1 || ABEtype == 2) +#if AMSS_LEGACY_ABE_TRANSFER if (PhysTime > 10) { ihn--; diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu index 554d8e4..e410cbf 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.cu +++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu @@ -7788,7 +7788,7 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag, } double t0 = profile ? cuda_profile_now_ms() : 0.0; - if (!use_resident_state || RK4 == 0 || !ctx.state_ready) { + if (!use_resident_state || !ctx.state_ready) { upload_state_inputs(state_host_in, all); } if (apply_enforce_ga) { @@ -8117,6 +8117,35 @@ extern "C" int z4c_cuda_upload_state_subset(void *block_tag, return 0; } +extern "C" int z4c_cuda_compute_constraints_resident(void *block_tag, + int *ex, double *X, double *Y, double *Z, + int Symmetry, double eps, int co, + double **constraint_host_out) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (!block_tag || !ex || !constraint_host_out) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + if (!ctx.state_ready) return 1; + setup_grid_params(ex, X, Y, Z, Symmetry, eps, co); + bind_state_input_slots(ctx.d_state_curr); + launch_z4c_rhs_pipeline((int)((size_t)ex[0] * ex[1] * ex[2]), eps); + download_constraint_outputs(constraint_host_out, (size_t)ex[0] * ex[1] * ex[2]); + return 0; +} + +extern "C" int z4c_cuda_download_constraint_outputs(int *ex, + double **constraint_host_out) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (!ex || !constraint_host_out) return 1; + download_constraint_outputs(constraint_host_out, (size_t)ex[0] * ex[1] * ex[2]); + return 0; +} + extern "C" int z4c_cuda_has_resident_state(void *block_tag) { using namespace z4c_cuda; diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.h b/AMSS_NCKU_source/z4c_rhs_cuda.h index 1d958ff..0adea53 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.h +++ b/AMSS_NCKU_source/z4c_rhs_cuda.h @@ -133,6 +133,14 @@ int z4c_cuda_upload_state_subset(void *block_tag, const int *state_indices, double **state_host_in); +int z4c_cuda_compute_constraints_resident(void *block_tag, + int *ex, double *X, double *Y, double *Z, + int Symmetry, double eps, int co, + double **constraint_host_out); + +int z4c_cuda_download_constraint_outputs(int *ex, + double **constraint_host_out); + int z4c_cuda_has_resident_state(void *block_tag); void z4c_cuda_release_step_ctx(void *block_tag); diff --git a/makefile_and_run.py b/makefile_and_run.py index ed77004..98114c7 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -145,7 +145,9 @@ def _gpu_runtime_env(): "AMSS_ANALYSIS_MAP_EVERY": "1000000", "AMSS_CUDA_AWARE_MPI": "1", "AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP": "1", + "AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP": "1", "AMSS_CUDA_KEEP_ALL_LEVELS": "1", + "AMSS_CUDA_Z4C_AMR_DEVICE": "1", "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1", "AMSS_CUDA_AMR_RESTRICT_BATCH": "0", "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0", @@ -276,7 +278,9 @@ def run_ABE(): print(f" AMSS_ANALYSIS_MAP_EVERY={mpi_env.get('AMSS_ANALYSIS_MAP_EVERY', '')}") print(f" AMSS_CUDA_AWARE_MPI={mpi_env.get('AMSS_CUDA_AWARE_MPI', '')}") print(f" AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP', '')}") + print(f" AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP', '')}") print(f" AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}") + print(f" AMSS_CUDA_Z4C_AMR_DEVICE={mpi_env.get('AMSS_CUDA_Z4C_AMR_DEVICE', '')}") print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}") print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}") print(f" AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")