From b1974ef146d228d020453c2b5913052d8a31e668 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 30 Apr 2026 20:01:18 +0800 Subject: [PATCH] Stabilize device AMR restrict across regrid --- AMSS_NCKU_source/bssn_class.C | 192 ++++++++++++++++++++++++++++++ AMSS_NCKU_source/bssn_rhs_cuda.cu | 51 ++++++++ AMSS_NCKU_source/bssn_rhs_cuda.h | 4 + 3 files changed, 247 insertions(+) diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index ce1070a..04576c3 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -583,6 +583,148 @@ void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int } } +void bssn_cuda_download_level_state_if_present(MyList *PatL, MyList *vars, int myrank) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) + { + double *state_out[BSSN_CUDA_STATE_COUNT]; + if (!fill_bssn_cuda_views(cg, vars, state_out)) + { + cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (bssn_cuda_download_resident_state_if_present(cg, cg->shape, state_out)) + { + cout << "CUDA resident state conditional download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} + +void bssn_cuda_release_level_state(MyList *PatL, int myrank) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) + bssn_cuda_release_step_ctx(cg); + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} + +void bssn_cuda_flush_level_before_regrid(MyList *PatL, + MyList *corL, + MyList *oldL, + MyList *stateL, + MyList *preL, + int myrank) +{ + bssn_cuda_download_level_state_if_present(PatL, corL, myrank); + bssn_cuda_download_level_state_if_present(PatL, oldL, myrank); + bssn_cuda_download_level_state_if_present(PatL, stateL, myrank); + bssn_cuda_download_level_state_if_present(PatL, preL, myrank); + bssn_cuda_release_level_state(PatL, myrank); +} + +bool bssn_cuda_regrid_flush_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_AMR_RESTRICT_DEVICE"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool bssn_cuda_will_regrid_onelevel(cgh *GH, int lev, int Symmetry, int BH_num, double **Porg0) +{ + if (!GH || lev < GH->movls || lev >= GH->levels || !GH->PatL[lev]) + return false; + if (!GH->PatL[lev]->data || !GH->PatL[lev]->data->blb || !GH->PatL[lev]->data->blb->data) + return true; + + const int do_every = 2; + const double dX = GH->PatL[lev]->data->blb->data->getdX(0); + const double dY = GH->PatL[lev]->data->blb->data->getdX(1); + const double dZ = GH->PatL[lev]->data->blb->data->getdX(2); + + for (int grd = 0; grd < GH->grids[lev]; grd++) + { + int bhi = 0; + for (bhi = 0; bhi < BH_num; bhi++) + { + if (feq(GH->Porgls[lev][bhi][0], GH->handle[lev][grd][0], 2 * do_every * dX) && + feq(GH->Porgls[lev][bhi][1], GH->handle[lev][grd][1], 2 * do_every * dY) && + feq(GH->Porgls[lev][bhi][2], GH->handle[lev][grd][2], 2 * do_every * dZ)) + break; + } + if (bhi == BH_num) + { + if (feq(0, GH->bbox[lev][grd][0], dX / 2) && + feq(0, GH->bbox[lev][grd][1], dY / 2) && + feq(0, GH->bbox[lev][grd][2], dZ / 2)) + continue; + if (BH_num == 1) + bhi = 0; + else + return true; + } + + double rr = (Porg0[bhi][0] - GH->handle[lev][grd][0]) / dX; + int flag = (rr > 0) ? int(rr + 0.5) / do_every : int(rr - 0.5) / do_every; + rr = flag * do_every * dX; + if (Symmetry == 2 && GH->bbox[lev][grd][0] + rr < 0) + rr = -GH->bbox[lev][grd][0]; + if (fabs(rr) > dX / 2) + return true; + + rr = (Porg0[bhi][1] - GH->handle[lev][grd][1]) / dY; + flag = (rr > 0) ? int(rr + 0.5) / do_every : int(rr - 0.5) / do_every; + rr = flag * do_every * dY; + if (Symmetry == 2 && GH->bbox[lev][grd][1] + rr < 0) + rr = -GH->bbox[lev][grd][1]; + if (fabs(rr) > dY / 2) + return true; + + rr = (Porg0[bhi][2] - GH->handle[lev][grd][2]) / dZ; + flag = (rr > 0) ? int(rr + 0.5) / do_every : int(rr - 0.5) / do_every; + rr = flag * do_every * dZ; + if (Symmetry > 0 && GH->bbox[lev][grd][2] + rr < 0) + rr = -GH->bbox[lev][grd][1]; + if (fabs(rr) > dZ / 2) + return true; + } + return false; +} + +bool bssn_cuda_should_flush_before_regrid(cgh *GH, int lev, int Symmetry, int BH_num, double **Porg0) +{ + return bssn_cuda_regrid_flush_enabled() && + bssn_cuda_will_regrid_onelevel(GH, lev, Symmetry, BH_num, Porg0); +} + void bssn_cuda_sync_level_bh_fields(MyList *PatL, int myrank, var *forx, var *fory, var *forz) @@ -2839,6 +2981,14 @@ void bssn_class::Evolve(int Steps) #if (REGLEV == 1) STEP_TIMER_DECL(timer_regrid); +#if USE_CUDA_BSSN + for (int il = 0; il < GH->levels; il++) + if (bssn_cuda_should_flush_before_regrid(GH, il, Symmetry, BH_num, Porg0)) + bssn_cuda_flush_level_before_regrid(GH->PatL[il], + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); +#endif GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); @@ -3075,6 +3225,13 @@ void bssn_class::RecursiveStep(int lev) #if (REGLEV == 0) STEP_TIMER_DECL(timer_regrid_onelevel); +#if USE_CUDA_BSSN + if (bssn_cuda_should_flush_before_regrid(GH, lev, Symmetry, BH_num, Porg0)) + bssn_cuda_flush_level_before_regrid(GH->PatL[lev], + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); +#endif if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) @@ -3261,6 +3418,13 @@ void bssn_class::ParallelStep() delete[] tporg; delete[] tporgo; #if (REGLEV == 0) +#if USE_CUDA_BSSN + if (bssn_cuda_should_flush_before_regrid(GH, GH->mylev, Symmetry, BH_num, Porg0)) + bssn_cuda_flush_level_before_regrid(GH->PatL[GH->mylev], + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); +#endif if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) @@ -3432,6 +3596,13 @@ void bssn_class::ParallelStep() if (lev + 1 >= GH->movls) { // GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0, +#if USE_CUDA_BSSN + if (bssn_cuda_should_flush_before_regrid(GH, lev + 1, Symmetry, BH_num, Porg0)) + bssn_cuda_flush_level_before_regrid(GH->PatL[lev + 1], + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); +#endif if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) @@ -3451,6 +3622,13 @@ void bssn_class::ParallelStep() // for this level if (YN == 1) { +#if USE_CUDA_BSSN + if (bssn_cuda_should_flush_before_regrid(GH, lev, Symmetry, BH_num, Porg0)) + bssn_cuda_flush_level_before_regrid(GH->PatL[lev], + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); +#endif if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) @@ -3474,6 +3652,13 @@ void bssn_class::ParallelStep() if (YN == 1) { // GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0, +#if USE_CUDA_BSSN + if (bssn_cuda_should_flush_before_regrid(GH, lev - 1, Symmetry, BH_num, Porg0)) + bssn_cuda_flush_level_before_regrid(GH->PatL[lev - 1], + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); +#endif if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) @@ -3494,6 +3679,13 @@ void bssn_class::ParallelStep() if (i % 4 == 3) { // GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0, +#if USE_CUDA_BSSN + if (bssn_cuda_should_flush_before_regrid(GH, lev - 1, Symmetry, BH_num, Porg0)) + bssn_cuda_flush_level_before_regrid(GH->PatL[lev - 1], + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); +#endif if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index c61df51..378c79e 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -6404,6 +6404,45 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos } } +static bool download_resident_state_if_present(void *block_tag, int *ex, double **state_host_out) +{ + auto it = g_step_ctx.find(block_tag); + if (it == g_step_ctx.end()) return false; + + StepContext &ctx = it->second; + const int bank = find_resident_bank(ctx, state_host_out); + if (bank < 0 || !ctx.resident_valid[bank]) + return false; + + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const size_t bytes = all * sizeof(double); + mark_resident_current_bank(ctx, bank); + if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr)) + return true; + + static int direct_download = -1; + if (direct_download < 0) { + const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD"); + direct_download = env ? ((atoi(env) != 0) ? 1 : 0) : 1; + } + if (direct_download) { + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_resident[bank][i], + bytes, cudaMemcpyDeviceToHost)); + } + CUDA_CHECK(cudaDeviceSynchronize()); + } else { + CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank], + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyDeviceToHost)); + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); + } + } + set_resident_host_clean(ctx, bank, true); + return true; +} + static void copy_state_subset(void *block_tag, int *ex, int subset_count, @@ -7056,6 +7095,18 @@ int bssn_cuda_download_resident_state(void *block_tag, return 0; } +extern "C" +int bssn_cuda_download_resident_state_if_present(void *block_tag, + int *ex, + double **state_host_out) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (!block_tag || !ex || !state_host_out) return 1; + download_resident_state_if_present(block_tag, ex, state_host_out); + return 0; +} + extern "C" int bssn_cuda_download_constraint_outputs(int *ex, double **constraint_host_out) diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 66ce74c..41e92f8 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -73,6 +73,10 @@ int bssn_cuda_download_resident_state(void *block_tag, int *ex, double **state_host_out); +int bssn_cuda_download_resident_state_if_present(void *block_tag, + int *ex, + double **state_host_out); + int bssn_cuda_download_constraint_outputs(int *ex, double **constraint_host_out);