From 8fe60ea703eca7b1c6252fea05363c7fb6382b4f Mon Sep 17 00:00:00 2001 From: ianchb Date: Wed, 15 Apr 2026 00:25:53 +0800 Subject: [PATCH] Add zero matter handling and interpolation for resident state in CUDA BSSN --- AMSS_NCKU_source/bssn_class.C | 202 ++++++++++++++++++++++++++++-- AMSS_NCKU_source/bssn_rhs_cuda.cu | 17 ++- AMSS_NCKU_source/bssn_rhs_cuda.h | 1 + 3 files changed, 209 insertions(+), 11 deletions(-) diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 326d8f3..c98bfd8 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -112,6 +112,182 @@ bool bssn_cuda_sync_bh_fields(Block *cg, var *forx, var *fory, var *forz, bool u return bssn_cuda_sync_subset(cg, 3, k_bssn_cuda_bh_state_indices, bh_fields, upload); } +bool bssn_cuda_patch_contains_point(Patch *patch, const double *point) +{ + if (!patch) + return false; + for (int d = 0; d < dim; d++) + { + const double h = patch->getdX(d); + const double lo = patch->bbox[d] + patch->lli[d] * h; + const double hi = patch->bbox[dim + d] - patch->uui[d] * h; + if (point[d] < lo || point[d] > hi) + return false; + } + return true; +} + +bool bssn_cuda_point_in_block(Patch *patch, Block *block, + const double *point, const double *DH) +{ + if (!patch || !block) + return false; + for (int d = 0; d < dim; d++) + { + double llb; + double uub; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2)) + ? block->bbox[d] + patch->lli[d] * DH[d] + : block->bbox[d] + (ghost_width - 0.5) * DH[d]; + uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2)) + ? block->bbox[dim + d] - patch->uui[d] * DH[d] + : block->bbox[dim + d] - (ghost_width - 0.5) * DH[d]; +#else +#ifdef Cell + llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2)) + ? block->bbox[d] + patch->lli[d] * DH[d] + : block->bbox[d] + ghost_width * DH[d]; + uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2)) + ? block->bbox[dim + d] - patch->uui[d] * DH[d] + : block->bbox[dim + d] - ghost_width * DH[d]; +#else +#error Not define Vertex nor Cell +#endif +#endif + if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2) + return false; + } + return true; +} + +int bssn_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn) +{ + if (!coords || n <= ordn) + return 0; + int cxi = int((x - coords[0]) / dx + 0.4) + 1; + int start = cxi - ordn / 2; + if (start < 0) + start = 0; + const int max_start = n - ordn; + if (start > max_start) + start = max_start; + return start; +} + +bool bssn_cuda_interp_bh_point_resident(MyList *PatL, + int myrank, + const double *point, + var *forx, var *fory, var *forz, + int Symmetry, + double *shellf) +{ + const int ordn = 2 * ghost_width; + int owner_rank = -1; + + shellf[0] = shellf[1] = shellf[2] = 0.0; + + MyList *PL = PatL; + while (PL) + { + Patch *patch = PL->data; + if (!bssn_cuda_patch_contains_point(patch, point)) + { + PL = PL->next; + continue; + } + + double DH[dim]; + for (int d = 0; d < dim; d++) + DH[d] = patch->getdX(d); + + MyList *BP = patch->blb; + while (BP) + { + Block *block = BP->data; + if (bssn_cuda_point_in_block(patch, block, point, DH)) + { + owner_rank = block->rank; + if (myrank == owner_rank) + { + int interp_ordn = ordn; + int interp_sym = Symmetry; + double x = point[0]; + double y = point[1]; + double z = point[2]; + + if (bssn_cuda_has_resident_state(block) && + block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn) + { + const int sx = ordn; + const int sy = ordn; + const int sz = ordn; + const int region_all = sx * sy * sz; + const int i0 = bssn_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn); + const int j0 = bssn_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn); + const int k0 = bssn_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn); + double packed_fields[3 * region_all]; + var *vars[3] = {forx, fory, forz}; + for (int f = 0; f < 3; f++) + { + if (bssn_cuda_pack_state_region_to_host_buffer(block, + k_bssn_cuda_bh_state_indices[f], + packed_fields + f * region_all, + block->shape, + i0, j0, k0, + sx, sy, sz) != 0) + { + cout << "CUDA BH tile download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int tile_shape[3] = {sx, sy, sz}; + f_global_interp(tile_shape, + block->X[0] + i0, + block->X[1] + j0, + block->X[2] + k0, + packed_fields + f * region_all, + shellf[f], + x, y, z, + interp_ordn, + vars[f]->SoA, + interp_sym); + } + } + else + { + f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], + block->fgfs[forx->sgfn], shellf[0], + x, y, z, interp_ordn, forx->SoA, interp_sym); + f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], + block->fgfs[fory->sgfn], shellf[1], + x, y, z, interp_ordn, fory->SoA, interp_sym); + f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], + block->fgfs[forz->sgfn], shellf[2], + x, y, z, interp_ordn, forz->SoA, interp_sym); + } + } + break; + } + if (BP == patch->ble) + break; + BP = BP->next; + } + + if (owner_rank >= 0) + break; + PL = PL->next; + } + + if (owner_rank < 0) + return false; + + MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD); + return true; +} + void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int myrank, bool release_ctx) { MyList *Pp = PatL; @@ -3254,6 +3430,8 @@ void bssn_class::Step(int lev, int YN) MPI_Abort(MPI_COMM_WORLD, 1); } int apply_bam_bc = 0; + // bssn_class does not evolve matter source fields; they remain zero-initialized. + int use_zero_matter = 1; int keep_resident_state = use_cuda_resident_sync ? 1 : 0; int apply_enforce_ga = 0; #if (AGM == 0) @@ -3270,6 +3448,7 @@ void bssn_class::Step(int lev, int YN) propspeed, soa_flat, Pp->data->bbox, dT_lev, TRK4, iter_count, apply_bam_bc, Symmetry, lev, ndeps, pre, + use_zero_matter, keep_resident_state, apply_enforce_ga, chitiny)) { cout << "CUDA predictor substep failed in domain: (" @@ -3573,12 +3752,8 @@ void bssn_class::Step(int lev, int YN) #if USE_CUDA_BSSN const bool need_analysis_state_after_predictor = (lev == a_lev) && (LastAnas + dT_lev >= AnasTime); - const bool need_bh_state_after_predictor = - (BH_num > 0) && (lev == GH->levels - 1); if (use_cuda_resident_sync && need_analysis_state_after_predictor) bssn_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false); - else if (use_cuda_resident_sync && need_bh_state_after_predictor) - bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx, Sfy, Sfz); #endif #ifdef WithShell @@ -3687,6 +3862,8 @@ void bssn_class::Step(int lev, int YN) MPI_Abort(MPI_COMM_WORLD, 1); } int apply_bam_bc = 0; + // bssn_class does not evolve matter source fields; they remain zero-initialized. + int use_zero_matter = 1; int keep_resident_state = use_cuda_resident_sync ? 1 : 0; int apply_enforce_ga = 0; #if (AGM == 0) @@ -3705,6 +3882,7 @@ void bssn_class::Step(int lev, int YN) propspeed, soa_flat, Pp->data->bbox, dT_lev, TRK4, iter_count, apply_bam_bc, Symmetry, lev, ndeps, cor, + use_zero_matter, keep_resident_state, apply_enforce_ga, chitiny)) { cout << "CUDA corrector substep failed in domain: (" @@ -4055,11 +4233,6 @@ void bssn_class::Step(int lev, int YN) } #endif -#if USE_CUDA_BSSN - if (use_cuda_resident_sync && BH_num > 0 && lev == GH->levels - 1 && iter_count < 3) - bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx1, Sfy1, Sfz1); -#endif - // swap time level if (iter_count < 3) { @@ -7145,6 +7318,17 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va int lev = ilev; +#if USE_CUDA_BSSN + if (bssn_cuda_use_resident_sync(lev) && + bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf)) + { + BH_RHS[n][0] = -shellf[0]; + BH_RHS[n][1] = -shellf[1]; + BH_RHS[n][2] = -shellf[2]; + continue; + } +#endif + #if (PSTR == 0) while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry)) #elif (PSTR == 1 || PSTR == 2 || PSTR == 3) diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index a6d70d9..310f014 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -4665,6 +4665,13 @@ static void upload_matter_cache(StepContext &ctx, ctx.matter_ready = true; } +static void zero_matter_cache(StepContext &ctx, size_t all) +{ + CUDA_CHECK(cudaMemset(ctx.d_matter_mem, 0, + (size_t)BSSN_MATTER_COUNT * all * sizeof(double))); + ctx.matter_ready = true; +} + static void bind_matter_slots(const StepContext &ctx) { for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { @@ -5630,6 +5637,7 @@ int bssn_cuda_rk4_substep(void *block_tag, int &Lev, double &eps, int &co, + int &use_zero_matter, int &keep_resident_state, int &apply_enforce_ga, double &chitiny) @@ -5686,12 +5694,17 @@ int bssn_cuda_rk4_substep(void *block_tag, t0 = profile ? cuda_profile_now_ms() : 0.0; if (RK4 == 0) { - upload_matter_cache(ctx, matter_host, all); + if (use_zero_matter) { + if (!ctx.matter_ready) zero_matter_cache(ctx, all); + } else { + upload_matter_cache(ctx, matter_host, all); + } CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi], (size_t)BSSN_STATE_COUNT * bytes, cudaMemcpyDeviceToDevice)); } else if (!ctx.matter_ready) { - upload_matter_cache(ctx, matter_host, all); + if (use_zero_matter) zero_matter_cache(ctx, all); + else upload_matter_cache(ctx, matter_host, all); } bind_matter_slots(ctx); if (profile) { diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 9f473e5..ce8a334 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -50,6 +50,7 @@ int bssn_cuda_rk4_substep(void *block_tag, int &Lev, double &eps, int &co, + int &use_zero_matter, int &keep_resident_state, int &apply_enforce_ga, double &chitiny);