From 1ee229a91f1b1890bbf106bb9feb90fa287a4a8b Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Wed, 29 Apr 2026 19:44:19 +0800 Subject: [PATCH] Add keyed BSSN CUDA resident banks --- AMSS_NCKU_source/Parallel.C | 194 ++++++-- AMSS_NCKU_source/bssn_class.C | 13 +- AMSS_NCKU_source/bssn_rhs_cuda.cu | 718 ++++++++++++++++++++++++++++-- AMSS_NCKU_source/bssn_rhs_cuda.h | 74 +++ 4 files changed, 910 insertions(+), 89 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 79a89fe..ead3566 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -172,6 +172,26 @@ int cuda_state_var_count(MyList *src_vars, MyList *dst_vars) return (src_vars || dst_vars) ? -1 : count; } +#if USE_CUDA_BSSN +bool cuda_build_bssn_host_views(Block *block, + MyList *vars, + int state_count, + double **views) +{ + if (!block || !vars || !views || state_count != BSSN_CUDA_STATE_COUNT) + return false; + MyList *v = vars; + for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i) + { + if (!v) + return false; + views[i] = block->fgfs[v->data->sgfn]; + v = v->next; + } + return v == 0; +} +#endif + #if USE_CUDA_BSSN || USE_CUDA_Z4C int fortran_idint(double x) { @@ -318,7 +338,8 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type) bool cuda_direct_pack_segment(double *buffer, const Parallel::gridseg *src, const Parallel::gridseg *dst, - int state_count) + int state_count, + MyList *VarLists) { #if USE_CUDA_Z4C && (ABEtype == 2) if (state_count != Z4C_CUDA_STATE_COUNT) @@ -338,9 +359,17 @@ bool cuda_direct_pack_segment(double *buffer, i0, j0, k0, dst->shape[0], dst->shape[1], dst->shape[2]) == 0; #else - const bool ok = bssn_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + double *views[BSSN_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views); + const bool ok = have_views + ? bssn_cuda_pack_state_batch_to_host_buffer_for_host_views( + src->Bg, views, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0 + : bssn_cuda_pack_state_batch_to_host_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; #endif if (sync_profile_enabled()) sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; @@ -349,7 +378,8 @@ bool cuda_direct_pack_segment(double *buffer, bool cuda_direct_unpack_segment(double *buffer, const Parallel::gridseg *dst, - int state_count) + int state_count, + MyList *VarListd) { #if USE_CUDA_Z4C && (ABEtype == 2) if (state_count != Z4C_CUDA_STATE_COUNT) @@ -369,9 +399,17 @@ bool cuda_direct_unpack_segment(double *buffer, i0, j0, k0, dst->shape[0], dst->shape[1], dst->shape[2]) == 0; #else - const bool ok = bssn_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + double *views[BSSN_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views); + const bool ok = have_views + ? bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views( + dst->Bg, views, state_count, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0 + : bssn_cuda_unpack_state_batch_from_host_buffer( + dst->Bg, state_count, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; #endif if (sync_profile_enabled()) sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0; @@ -441,43 +479,62 @@ bool cuda_direct_pack_segment_to_device(double *buffer, const Parallel::gridseg *src, const Parallel::gridseg *dst, int state_count, - int type) + int type, + MyList *VarLists) { #if USE_CUDA_BSSN if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) return false; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; bool ok = false; + double *views[BSSN_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views); if (type == 1) { const int i0 = cuda_seg_begin(dst, src->Bg, 0); const int j0 = cuda_seg_begin(dst, src->Bg, 1); const int k0 = cuda_seg_begin(dst, src->Bg, 2); - ok = bssn_cuda_pack_state_batch_to_device_buffer( - src->Bg, state_count, buffer, src->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + ok = have_views + ? bssn_cuda_pack_state_batch_to_device_buffer_for_host_views( + src->Bg, views, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0 + : bssn_cuda_pack_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; } else if (type == 2) { int first_fine[3]; if (!cuda_cell_gw3_restrict_params(src, dst, first_fine)) return false; - ok = bssn_cuda_restrict_state_batch_to_device_buffer( - src->Bg, state_count, buffer, src->Bg->shape, - dst->shape[0], dst->shape[1], dst->shape[2], - first_fine[0], first_fine[1], first_fine[2]) == 0; + ok = have_views + ? bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views( + src->Bg, views, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2]) == 0 + : bssn_cuda_restrict_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2]) == 0; } else if (type == 3) { int first_fine_ii[3], coarse_lb[3]; if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb)) return false; - ok = bssn_cuda_prolong_state_batch_to_device_buffer( - src->Bg, state_count, buffer, src->Bg->shape, - dst->shape[0], dst->shape[1], dst->shape[2], - first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], - coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0; + ok = have_views + ? bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views( + src->Bg, views, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0 + : bssn_cuda_prolong_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0; } if (sync_profile_enabled()) sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; @@ -490,7 +547,8 @@ bool cuda_direct_pack_segment_to_device(double *buffer, bool cuda_direct_unpack_segment_from_device(double *buffer, const Parallel::gridseg *dst, - int state_count) + int state_count, + MyList *VarListd) { #if USE_CUDA_BSSN if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) @@ -499,10 +557,17 @@ bool cuda_direct_unpack_segment_from_device(double *buffer, const int i0 = cuda_seg_begin(dst, dst->Bg, 0); const int j0 = cuda_seg_begin(dst, dst->Bg, 1); const int k0 = cuda_seg_begin(dst, dst->Bg, 2); - const bool ok = bssn_cuda_unpack_state_batch_from_device_buffer( - dst->Bg, state_count, buffer, dst->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + double *views[BSSN_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views); + const bool ok = have_views + ? bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views( + dst->Bg, views, state_count, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0 + : bssn_cuda_unpack_state_batch_from_device_buffer( + dst->Bg, state_count, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; if (sync_profile_enabled()) sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0; return ok; @@ -577,16 +642,25 @@ bool cuda_flush_device_segment_batch(Block *block, double *data, int state_count, const std::vector &meta, - int dir) + int dir, + MyList *vars) { if (!block || meta.empty()) return true; const int segment_count = (int)(meta.size() / 8); + double *views[BSSN_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views); if (dir == PACK) - return bssn_cuda_pack_state_segments_to_device_buffer( - block, state_count, data, block->shape, segment_count, meta.data()) == 0; - return bssn_cuda_unpack_state_segments_from_device_buffer( - block, state_count, data, block->shape, segment_count, meta.data()) == 0; + return have_views + ? bssn_cuda_pack_state_segments_to_device_buffer_for_host_views( + block, views, state_count, data, block->shape, segment_count, meta.data()) == 0 + : bssn_cuda_pack_state_segments_to_device_buffer( + block, state_count, data, block->shape, segment_count, meta.data()) == 0; + return have_views + ? bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views( + block, views, state_count, data, block->shape, segment_count, meta.data()) == 0 + : bssn_cuda_unpack_state_segments_from_device_buffer( + block, state_count, data, block->shape, segment_count, meta.data()) == 0; } int cuda_data_packer_device_batched(double *data, @@ -638,7 +712,8 @@ int cuda_data_packer_device_batched(double *data, if (batch_block && batch_block != block) { - if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir)) + MyList *batch_vars = (dir == PACK) ? VarLists : VarListd; + if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_vars)) return -1; batch_meta.clear(); } @@ -672,7 +747,8 @@ int cuda_data_packer_device_batched(double *data, if (batch_block) { - if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir)) + MyList *batch_vars = (dir == PACK) ? VarLists : VarListd; + if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_vars)) return -1; } return size_out; @@ -4542,9 +4618,9 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, dst->data, type)) { if (s_cuda_aware_pack_active) { - handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type); + handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists); } else { - handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count); + handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count, VarLists); } if (!handled_by_cuda) { @@ -4557,9 +4633,9 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, type)) { if (s_cuda_aware_pack_active) { - handled_by_cuda = cuda_direct_unpack_segment_from_device(data + size_out, dst->data, state_count); + handled_by_cuda = cuda_direct_unpack_segment_from_device(data + size_out, dst->data, state_count, VarListd); } else { - handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_count); + handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_count, VarListd); } if (!handled_by_cuda) { @@ -6476,6 +6552,26 @@ void Parallel::prepare_inter_time_level(Patch *Pat, Block *cg = BP->data; if (myrank == cg->rank) { +#if USE_CUDA_BSSN + double *src1_views[BSSN_CUDA_STATE_COUNT]; + double *src2_views[BSSN_CUDA_STATE_COUNT]; + double *dst_views[BSSN_CUDA_STATE_COUNT]; + const int state_count = cuda_state_var_count(VarList1, VarList2); + if (state_count == BSSN_CUDA_STATE_COUNT && + cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && + cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && + cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) && + bssn_cuda_has_resident_state(cg) && + bssn_cuda_prepare_inter_time_level(cg, cg->shape, + src1_views, src2_views, 0, dst_views, + 2, tindex) == 0) + { + if (BP == Pat->ble) + break; + BP = BP->next; + continue; + } +#endif varl1 = VarList1; varl2 = VarList2; varl3 = VarList3; @@ -6531,6 +6627,28 @@ void Parallel::prepare_inter_time_level(Patch *Pat, Block *cg = BP->data; if (myrank == cg->rank) { +#if USE_CUDA_BSSN + double *src1_views[BSSN_CUDA_STATE_COUNT]; + double *src2_views[BSSN_CUDA_STATE_COUNT]; + double *src3_views[BSSN_CUDA_STATE_COUNT]; + double *dst_views[BSSN_CUDA_STATE_COUNT]; + const int state_count = cuda_state_var_count(VarList1, VarList2); + if (state_count == BSSN_CUDA_STATE_COUNT && + cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && + cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && + cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) && + cuda_build_bssn_host_views(cg, VarList4, state_count, dst_views) && + bssn_cuda_has_resident_state(cg) && + bssn_cuda_prepare_inter_time_level(cg, cg->shape, + src1_views, src2_views, src3_views, dst_views, + 3, tindex) == 0) + { + if (BP == Pat->ble) + break; + BP = BP->next; + continue; + } +#endif varl1 = VarList1; varl2 = VarList2; varl3 = VarList3; diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index cdd56e3..96d1fef 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -273,6 +273,12 @@ bool bssn_cuda_use_resident_sync(int lev) bool bssn_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev) { + static int keep_all_levels = -1; + if (keep_all_levels < 0) + { + const char *env = getenv("AMSS_CUDA_KEEP_ALL_LEVELS"); + keep_all_levels = (env && atoi(env) != 0) ? 1 : 0; + } static int enabled = -1; if (enabled < 0) { @@ -281,11 +287,12 @@ bool bssn_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev) } if (!enabled) return false; - // Levels at and above trfls can be read by CPU time interpolation through - // State/Old/corrector lists. Keep those conservative until multi-time-level - // resident storage is implemented. if (lev == analysis_lev) return false; + if (keep_all_levels) + return true; + // Conservative default: high time-refinement levels still have scattered + // CPU consumers outside the RK4/AMR exchange path. return lev < trfls_in; } diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 03b2b73..80e3918 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -437,6 +437,7 @@ static const int STAGE_SLOT_COUNT = static constexpr int BSSN_STATE_COUNT = 24; static constexpr int BSSN_MATTER_COUNT = 10; static constexpr int BSSN_LK_FIELD_COUNT = 24; +static constexpr int BSSN_RESIDENT_BANK_COUNT = 4; static const int k_state_input_slots[BSSN_STATE_COUNT] = { S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz, @@ -512,6 +513,7 @@ struct StepContext { double *d_accum_mem; double *d_state_curr_mem; double *d_state_next_mem; + std::array d_resident_mem; double *d_matter_mem; double *d_comm_mem; double *h_comm_mem; @@ -519,6 +521,10 @@ struct StepContext { std::array d_accum; std::array d_state_curr; std::array d_state_next; + std::array, BSSN_RESIDENT_BANK_COUNT> d_resident; + std::array, BSSN_RESIDENT_BANK_COUNT> resident_host; + std::array resident_age; + std::array resident_valid; std::array d_matter; size_t cap_all; size_t cap_comm; @@ -526,18 +532,29 @@ struct StepContext { size_t cap_h_comm; bool matter_ready; bool state_ready; + int current_bank; + unsigned long long resident_clock; StepContext() : d_state0_mem(nullptr), d_accum_mem(nullptr), d_state_curr_mem(nullptr), d_state_next_mem(nullptr), + d_resident_mem{}, d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr), cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0), - matter_ready(false), state_ready(false) + matter_ready(false), state_ready(false), + current_bank(-1), resident_clock(0) { + d_resident_mem.fill(nullptr); d_state0.fill(nullptr); d_accum.fill(nullptr); d_state_curr.fill(nullptr); d_state_next.fill(nullptr); + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + d_resident[b].fill(nullptr); + resident_host[b].fill(nullptr); + } + resident_age.fill(0); + resident_valid.fill(false); d_matter.fill(nullptr); } }; @@ -545,8 +562,7 @@ struct StepContext { struct StepAllocation { double *d_state0_mem; double *d_accum_mem; - double *d_state_curr_mem; - double *d_state_next_mem; + std::array d_resident_mem; double *d_matter_mem; double *d_comm_mem; double *h_comm_mem; @@ -563,11 +579,17 @@ static size_t g_comm_segment_meta_cap = 0; static StepAllocation empty_step_allocation() { - StepAllocation alloc = { - nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr, - 0, 0, false, 0 - }; + StepAllocation alloc = {}; + alloc.d_state0_mem = nullptr; + alloc.d_accum_mem = nullptr; + alloc.d_resident_mem.fill(nullptr); + alloc.d_matter_mem = nullptr; + alloc.d_comm_mem = nullptr; + alloc.h_comm_mem = nullptr; + alloc.cap_all = 0; + alloc.cap_comm = 0; + alloc.h_comm_pinned = false; + alloc.cap_h_comm = 0; return alloc; } @@ -578,16 +600,22 @@ static bool has_step_allocation(const StepAllocation &alloc) static StepAllocation detach_step_allocation(StepContext &ctx) { - StepAllocation alloc = { - ctx.d_state0_mem, ctx.d_accum_mem, ctx.d_state_curr_mem, - ctx.d_state_next_mem, ctx.d_matter_mem, - ctx.d_comm_mem, ctx.h_comm_mem, - ctx.cap_all, ctx.cap_comm, ctx.h_comm_pinned, ctx.cap_h_comm - }; + StepAllocation alloc = {}; + alloc.d_state0_mem = ctx.d_state0_mem; + alloc.d_accum_mem = ctx.d_accum_mem; + alloc.d_resident_mem = ctx.d_resident_mem; + alloc.d_matter_mem = ctx.d_matter_mem; + alloc.d_comm_mem = ctx.d_comm_mem; + alloc.h_comm_mem = ctx.h_comm_mem; + alloc.cap_all = ctx.cap_all; + alloc.cap_comm = ctx.cap_comm; + alloc.h_comm_pinned = ctx.h_comm_pinned; + alloc.cap_h_comm = ctx.cap_h_comm; ctx.d_state0_mem = nullptr; ctx.d_accum_mem = nullptr; ctx.d_state_curr_mem = nullptr; ctx.d_state_next_mem = nullptr; + ctx.d_resident_mem.fill(nullptr); ctx.d_matter_mem = nullptr; ctx.d_comm_mem = nullptr; ctx.h_comm_mem = nullptr; @@ -597,10 +625,18 @@ static StepAllocation detach_step_allocation(StepContext &ctx) ctx.cap_h_comm = 0; ctx.matter_ready = false; ctx.state_ready = false; + ctx.current_bank = -1; + ctx.resident_clock = 0; ctx.d_state0.fill(nullptr); ctx.d_accum.fill(nullptr); ctx.d_state_curr.fill(nullptr); ctx.d_state_next.fill(nullptr); + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + ctx.d_resident[b].fill(nullptr); + ctx.resident_host[b].fill(nullptr); + } + ctx.resident_age.fill(0); + ctx.resident_valid.fill(false); ctx.d_matter.fill(nullptr); return alloc; } @@ -609,8 +645,9 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc { ctx.d_state0_mem = alloc.d_state0_mem; ctx.d_accum_mem = alloc.d_accum_mem; - ctx.d_state_curr_mem = alloc.d_state_curr_mem; - ctx.d_state_next_mem = alloc.d_state_next_mem; + ctx.d_resident_mem = alloc.d_resident_mem; + ctx.d_state_curr_mem = nullptr; + ctx.d_state_next_mem = nullptr; ctx.d_matter_mem = alloc.d_matter_mem; ctx.d_comm_mem = alloc.d_comm_mem; ctx.h_comm_mem = alloc.h_comm_mem; @@ -620,6 +657,13 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc ctx.cap_h_comm = alloc.cap_h_comm; ctx.matter_ready = false; ctx.state_ready = false; + ctx.current_bank = -1; + ctx.resident_clock = 0; + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + ctx.resident_host[b].fill(nullptr); + } + ctx.resident_age.fill(0); + ctx.resident_valid.fill(false); } static void recycle_step_allocation(StepAllocation &alloc) @@ -708,8 +752,10 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) if (!has_step_allocation(alloc)) { CUDA_CHECK(cudaMalloc(&alloc.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double))); CUDA_CHECK(cudaMalloc(&alloc.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double))); - CUDA_CHECK(cudaMalloc(&alloc.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double))); - CUDA_CHECK(cudaMalloc(&alloc.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double))); + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + CUDA_CHECK(cudaMalloc(&alloc.d_resident_mem[b], + BSSN_STATE_COUNT * all * sizeof(double))); + } CUDA_CHECK(cudaMalloc(&alloc.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double))); alloc.cap_all = all; } @@ -718,8 +764,13 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) for (int i = 0; i < BSSN_STATE_COUNT; ++i) { ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all; ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all; - ctx.d_state_curr[i] = ctx.d_state_curr_mem + (size_t)i * all; - ctx.d_state_next[i] = ctx.d_state_next_mem + (size_t)i * all; + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + ctx.d_resident[b][i] = ctx.d_resident_mem[b] + (size_t)i * all; + } + } + if (ctx.current_bank >= 0) { + ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank]; + ctx.d_state_curr = ctx.d_resident[ctx.current_bank]; } for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all; @@ -4825,6 +4876,263 @@ static void bind_state_output_slots(const std::array } } +static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key) +{ + if (!host_key || bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) + return false; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + if (!host_key[i] || ctx.resident_host[bank][i] != host_key[i]) + return false; + } + return true; +} + +static int find_resident_bank(const StepContext &ctx, double **host_key) +{ + if (!host_key) return -1; + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (resident_key_matches(ctx, b, host_key)) + return b; + } + return -1; +} + +static bool resident_key_usable(double **host_key) +{ + if (!host_key) return false; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + if (!host_key[i]) return false; + } + return true; +} + +static void mark_resident_current_bank(StepContext &ctx, int bank) +{ + if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return; + ctx.current_bank = bank; + ctx.d_state_curr_mem = ctx.d_resident_mem[bank]; + ctx.d_state_curr = ctx.d_resident[bank]; + ctx.state_ready = ctx.resident_valid[bank]; +} + +static void mark_resident_next_bank(StepContext &ctx, int bank) +{ + if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return; + ctx.d_state_next_mem = ctx.d_resident_mem[bank]; + ctx.d_state_next = ctx.d_resident[bank]; +} + +static bool any_resident_bank_valid(const StepContext &ctx) +{ + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (ctx.resident_valid[b]) return true; + } + return false; +} + +static void update_state_ready(StepContext &ctx) +{ + ctx.state_ready = any_resident_bank_valid(ctx); +} + +static void writeback_resident_bank(StepContext &ctx, int bank, size_t all) +{ + if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return; + if (!ctx.resident_valid[bank]) return; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + if (!ctx.resident_host[bank][i]) return; + } + const size_t bytes = all * sizeof(double); + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + CUDA_CHECK(cudaMemcpyAsync(ctx.resident_host[bank][i], + ctx.d_resident[bank][i], + bytes, cudaMemcpyDeviceToHost)); + } + CUDA_CHECK(cudaDeviceSynchronize()); +} + +static int choose_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all) +{ + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (b != avoid_bank && !ctx.resident_valid[b]) + return b; + } + + int best = -1; + unsigned long long best_age = 0; + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (b == avoid_bank) continue; + if (best < 0 || ctx.resident_age[b] < best_age) { + best = b; + best_age = ctx.resident_age[b]; + } + } + if (best < 0) best = 0; + writeback_resident_bank(ctx, best, all); + ctx.resident_valid[best] = false; + ctx.resident_host[best].fill(nullptr); + ctx.resident_age[best] = 0; + if (ctx.current_bank == best) { + ctx.current_bank = -1; + ctx.d_state_curr_mem = nullptr; + ctx.d_state_curr.fill(nullptr); + } + update_state_ready(ctx); + return best; +} + +static void assign_resident_key(StepContext &ctx, int bank, double **host_key) +{ + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + ctx.resident_host[bank][i] = host_key[i]; + } + ctx.resident_age[bank] = ++ctx.resident_clock; +} + +static int ensure_resident_bank(StepContext &ctx, + double **host_key, + size_t all, + bool upload_if_missing, + int avoid_bank = -1) +{ + if (!resident_key_usable(host_key)) { + if (ctx.current_bank >= 0) + return ctx.current_bank; + return 0; + } + + int bank = find_resident_bank(ctx, host_key); + if (bank >= 0) { + ctx.resident_age[bank] = ++ctx.resident_clock; + if (!ctx.resident_valid[bank] && upload_if_missing) { + bind_state_input_slots(ctx.d_resident[bank]); + upload_state_inputs(host_key, all); + ctx.resident_valid[bank] = true; + } + return bank; + } + + bank = choose_resident_bank_for_reuse(ctx, avoid_bank, all); + assign_resident_key(ctx, bank, host_key); + if (upload_if_missing) { + bind_state_input_slots(ctx.d_resident[bank]); + upload_state_inputs(host_key, all); + ctx.resident_valid[bank] = true; + } else { + ctx.resident_valid[bank] = false; + } + update_state_ready(ctx); + return bank; +} + +static int reserve_resident_output_bank(StepContext &ctx, + double **host_key, + size_t all, + int input_bank) +{ + if (!resident_key_usable(host_key)) + return (ctx.current_bank >= 0) ? ctx.current_bank : 0; + if (resident_key_matches(ctx, input_bank, host_key)) + return input_bank; + + int bank = find_resident_bank(ctx, host_key); + if (bank < 0) + bank = choose_resident_bank_for_reuse(ctx, input_bank, all); + assign_resident_key(ctx, bank, host_key); + ctx.resident_valid[bank] = false; + ctx.resident_age[bank] = ++ctx.resident_clock; + update_state_ready(ctx); + return bank; +} + +static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c) +{ + return bank == avoid_a || bank == avoid_b || bank == avoid_c; +} + +static int choose_resident_bank_for_reuse_avoiding(StepContext &ctx, + int avoid_a, + int avoid_b, + int avoid_c, + size_t all) +{ + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (!bank_is_avoided(b, avoid_a, avoid_b, avoid_c) && !ctx.resident_valid[b]) + return b; + } + + int best = -1; + unsigned long long best_age = 0; + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (bank_is_avoided(b, avoid_a, avoid_b, avoid_c)) continue; + if (best < 0 || ctx.resident_age[b] < best_age) { + best = b; + best_age = ctx.resident_age[b]; + } + } + if (best < 0) + return choose_resident_bank_for_reuse(ctx, avoid_a, all); + + writeback_resident_bank(ctx, best, all); + ctx.resident_valid[best] = false; + ctx.resident_host[best].fill(nullptr); + ctx.resident_age[best] = 0; + if (ctx.current_bank == best) { + ctx.current_bank = -1; + ctx.d_state_curr_mem = nullptr; + ctx.d_state_curr.fill(nullptr); + } + update_state_ready(ctx); + return best; +} + +static int reserve_resident_output_bank_avoiding(StepContext &ctx, + double **host_key, + size_t all, + int avoid_a, + int avoid_b, + int avoid_c) +{ + if (!resident_key_usable(host_key)) + return (ctx.current_bank >= 0) ? ctx.current_bank : 0; + if (resident_key_matches(ctx, avoid_a, host_key)) + return avoid_a; + if (resident_key_matches(ctx, avoid_b, host_key)) + return avoid_b; + if (resident_key_matches(ctx, avoid_c, host_key)) + return avoid_c; + + int bank = find_resident_bank(ctx, host_key); + if (bank < 0) + bank = choose_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all); + assign_resident_key(ctx, bank, host_key); + ctx.resident_valid[bank] = false; + ctx.resident_age[bank] = ++ctx.resident_clock; + update_state_ready(ctx); + return bank; +} + +static int active_or_keyed_bank(StepContext &ctx, + double **host_key, + size_t all, + bool upload_if_missing) +{ + if (resident_key_usable(host_key)) { + int bank = ensure_resident_bank(ctx, host_key, all, upload_if_missing); + mark_resident_current_bank(ctx, bank); + return bank; + } + if (ctx.current_bank >= 0) + return ctx.current_bank; + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (ctx.resident_valid[b]) { + mark_resident_current_bank(ctx, b); + return b; + } + } + return 0; +} + static void launch_rhs_pipeline(int all, double eps, int co) { const double SYM = 1.0; @@ -5165,6 +5473,28 @@ static void download_constraint_outputs(double **constraint_host_out, size_t all } } +__global__ void kern_prepare_inter_time_level(const double * __restrict__ src1, + const double * __restrict__ src2, + const double * __restrict__ src3, + double * __restrict__ dst, + double c1, + double c2, + double c3, + int state_count, + int all) +{ + const int state = blockIdx.y; + if (state >= state_count) return; + const size_t off = (size_t)state * all; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < all; + i += blockDim.x * gridDim.x) + { + const double v3 = src3 ? src3[off + i] : 0.0; + dst[off + i] = c1 * src1[off + i] + c2 * src2[off + i] + c3 * v3; + } +} + __global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem, double * __restrict__ dst, int nx, int ny, @@ -5416,13 +5746,18 @@ static void copy_state_region_cuda(void *block_tag, const int *ex, int i0, int j0, int k0, int sx, int sy, int sz, - cudaMemcpyKind kind) + cudaMemcpyKind kind, + double **state_host_key = nullptr) { if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return; if (sx <= 0 || sy <= 0 || sz <= 0) return; const size_t pitch = (size_t)ex[0] * sizeof(double); StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, state_host_key, all, + kind == cudaMemcpyHostToDevice); + double *base_mem = ctx.d_resident_mem[bank]; cudaMemcpy3DParms p = {}; p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz); @@ -5430,13 +5765,19 @@ static void copy_state_region_cuda(void *block_tag, p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0); if (kind == cudaMemcpyDeviceToHost) { - p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]); + p.srcPtr = make_cudaPitchedPtr((void *)(base_mem + (size_t)state_index * all), pitch, ex[0], ex[1]); p.dstPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]); } else { p.srcPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]); - p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]); + p.dstPtr = make_cudaPitchedPtr((void *)(base_mem + (size_t)state_index * all), pitch, ex[0], ex[1]); } CUDA_CHECK(cudaMemcpy3D(&p)); + if (kind == cudaMemcpyHostToDevice) { + ctx.resident_valid[bank] = true; + ctx.resident_age[bank] = ++ctx.resident_clock; + mark_resident_current_bank(ctx, bank); + update_state_ready(ctx); + } } static void copy_state_region_packed_cuda(void *block_tag, @@ -5445,7 +5786,8 @@ static void copy_state_region_packed_cuda(void *block_tag, const int *ex, int i0, int j0, int k0, int sx, int sy, int sz, - cudaMemcpyKind kind) + cudaMemcpyKind kind, + double **state_host_key = nullptr) { if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return; if (sx <= 0 || sy <= 0 || sz <= 0) return; @@ -5453,23 +5795,33 @@ static void copy_state_region_packed_cuda(void *block_tag, const size_t src_pitch = (size_t)ex[0] * sizeof(double); const size_t dst_pitch = (size_t)sx * sizeof(double); StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, state_host_key, all, + kind == cudaMemcpyHostToDevice); + double *base_mem = ctx.d_resident_mem[bank]; cudaMemcpy3DParms p = {}; p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz); if (kind == cudaMemcpyDeviceToHost) { - p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]); + p.srcPtr = make_cudaPitchedPtr((void *)(base_mem + (size_t)state_index * all), src_pitch, ex[0], ex[1]); p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0); p.dstPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy); p.dstPos = make_cudaPos(0, 0, 0); } else { p.srcPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy); p.srcPos = make_cudaPos(0, 0, 0); - p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]); + p.dstPtr = make_cudaPitchedPtr((void *)(base_mem + (size_t)state_index * all), src_pitch, ex[0], ex[1]); p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0); } CUDA_CHECK(cudaMemcpy3D(&p)); + if (kind == cudaMemcpyHostToDevice) { + ctx.resident_valid[bank] = true; + ctx.resident_age[bank] = ++ctx.resident_clock; + mark_resident_current_bank(ctx, bank); + update_state_ready(ctx); + } } static void copy_state_region_packed_batch_cuda(void *block_tag, @@ -5478,12 +5830,17 @@ static void copy_state_region_packed_batch_cuda(void *block_tag, const int *ex, int i0, int j0, int k0, int sx, int sy, int sz, - cudaMemcpyKind kind) + cudaMemcpyKind kind, + double **state_host_key = nullptr) { if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; if (sx <= 0 || sy <= 0 || sz <= 0) return; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, state_host_key, all, + kind == cudaMemcpyHostToDevice); + double *base_mem = ctx.d_resident_mem[bank]; const int region_all = sx * sy * sz; const size_t total_doubles = (size_t)state_count * (size_t)region_all; double *d_comm = ensure_step_comm_buffer(ctx, total_doubles); @@ -5492,7 +5849,7 @@ static void copy_state_region_packed_batch_cuda(void *block_tag, dim3 launch_grid((unsigned int)grid((size_t)region_all), (unsigned int)state_count); kern_pack_state_region_batch<<>>( - ctx.d_state_curr_mem, d_comm, ex[0], ex[1], + base_mem, d_comm, ex[0], ex[1], i0, j0, k0, sx, sy, sz, region_all, state_count, ex[0] * ex[1] * ex[2]); CUDA_CHECK(cudaMemcpy(host_buffer, d_comm, @@ -5505,9 +5862,13 @@ static void copy_state_region_packed_batch_cuda(void *block_tag, dim3 launch_grid((unsigned int)grid((size_t)region_all), (unsigned int)state_count); kern_unpack_state_region_batch<<>>( - ctx.d_state_curr_mem, d_comm, ex[0], ex[1], + base_mem, d_comm, ex[0], ex[1], i0, j0, k0, sx, sy, sz, region_all, state_count, ex[0] * ex[1] * ex[2]); + ctx.resident_valid[bank] = true; + ctx.resident_age[bank] = ++ctx.resident_clock; + mark_resident_current_bank(ctx, bank); + update_state_ready(ctx); } } @@ -5516,6 +5877,11 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t bytes = all * sizeof(double); StepContext &ctx = ensure_step_ctx(block_tag, all); + int bank = find_resident_bank(ctx, state_host_out); + if (bank < 0) { + bank = (ctx.current_bank >= 0) ? ctx.current_bank : active_or_keyed_bank(ctx, nullptr, all, false); + } + mark_resident_current_bank(ctx, bank); const bool profile = cuda_profile_enabled(); const double t0 = profile ? cuda_profile_now_ms() : 0.0; static int direct_download = -1; @@ -5525,7 +5891,7 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos } if (direct_download) { for (int i = 0; i < BSSN_STATE_COUNT; ++i) { - CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_state_curr[i], + CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_resident[bank][i], bytes, cudaMemcpyDeviceToHost)); } CUDA_CHECK(cudaDeviceSynchronize()); @@ -5537,7 +5903,7 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos } return; } - CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem, + CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank], (size_t)BSSN_STATE_COUNT * bytes, cudaMemcpyDeviceToHost)); for (int i = 0; i < BSSN_STATE_COUNT; ++i) { @@ -5562,6 +5928,10 @@ static void copy_state_subset(void *block_tag, const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t bytes = all * sizeof(double); StepContext &ctx = ensure_step_ctx(block_tag, all); + double **full_key = (subset_count == BSSN_STATE_COUNT) ? state_host : nullptr; + const int bank = active_or_keyed_bank(ctx, full_key, all, + kind == cudaMemcpyHostToDevice); + double *base_mem = ctx.d_resident_mem[bank]; int active_state_indices[BSSN_STATE_COUNT]; double *active_state_host[BSSN_STATE_COUNT]; int active_count = 0; @@ -5586,7 +5956,7 @@ static void copy_state_subset(void *block_tag, if (kind == cudaMemcpyDeviceToHost) { dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count); kern_pack_state_subset<<>>( - ctx.d_state_curr_mem, d_comm, active_count, (int)all); + base_mem, d_comm, active_count, (int)all); CUDA_CHECK(cudaMemcpy(h_comm, d_comm, total_doubles * sizeof(double), cudaMemcpyDeviceToHost)); @@ -5606,14 +5976,18 @@ static void copy_state_subset(void *block_tag, cudaMemcpyHostToDevice)); dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count); kern_unpack_state_subset<<>>( - ctx.d_state_curr_mem, d_comm, active_count, (int)all); + base_mem, d_comm, active_count, (int)all); + ctx.resident_valid[bank] = true; + ctx.resident_age[bank] = ++ctx.resident_clock; + mark_resident_current_bank(ctx, bank); + update_state_ready(ctx); } } static bool has_resident_state(void *block_tag) { auto it = g_step_ctx.find(block_tag); - return it != g_step_ctx.end() && it->second.state_ready; + return it != g_step_ctx.end() && any_resident_bank_valid(it->second); } /* ================================================================== */ @@ -6001,7 +6375,6 @@ int bssn_cuda_rk4_substep(void *block_tag, double &chitiny) { (void)T; - (void)state_host_out; if (RK4 < 0 || RK4 > 3) return 1; @@ -6031,12 +6404,18 @@ int bssn_cuda_rk4_substep(void *block_tag, } StepContext &ctx = ensure_step_ctx(block_tag, all); const bool use_resident_state = (keep_resident_state != 0); + int input_bank = -1; + int output_bank = -1; if (use_resident_state) { - bind_state_input_slots(ctx.d_state_curr); - bind_state_output_slots(ctx.d_state_next); + input_bank = ensure_resident_bank(ctx, state_host_in, all, true); + output_bank = reserve_resident_output_bank(ctx, state_host_out, all, input_bank); + mark_resident_current_bank(ctx, input_bank); + mark_resident_next_bank(ctx, output_bank); + bind_state_input_slots(ctx.d_resident[input_bank]); + bind_state_output_slots(ctx.d_resident[output_bank]); } double t0 = profile ? cuda_profile_now_ms() : 0.0; - if (!use_resident_state || !ctx.state_ready) { + if (!use_resident_state) { upload_state_inputs(state_host_in, all); } if (apply_enforce_ga) { @@ -6057,7 +6436,10 @@ int bssn_cuda_rk4_substep(void *block_tag, } else { upload_matter_cache(ctx, matter_host, all); } - CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi], + const double *state0_src = use_resident_state + ? ctx.d_resident_mem[input_bank] + : g_buf.slot[S_chi]; + CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, state0_src, (size_t)BSSN_STATE_COUNT * bytes, cudaMemcpyDeviceToDevice)); } else if (!ctx.matter_ready) { @@ -6109,9 +6491,10 @@ int bssn_cuda_rk4_substep(void *block_tag, t0 = profile ? cuda_profile_now_ms() : 0.0; if (use_resident_state) { - std::swap(ctx.d_state_curr_mem, ctx.d_state_next_mem); - ctx.d_state_curr.swap(ctx.d_state_next); - ctx.state_ready = true; + ctx.resident_valid[output_bank] = true; + ctx.resident_age[output_bank] = ++ctx.resident_clock; + mark_resident_current_bank(ctx, output_bank); + update_state_ready(ctx); } else { download_state_outputs(state_host_out, all); } @@ -6233,6 +6616,24 @@ int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex, + i0, j0, k0, sx, sy, sz, + cudaMemcpyDeviceToHost, + state_host_key); + return 0; +} + extern "C" int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag, int state_count, @@ -6249,34 +6650,61 @@ int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex, + i0, j0, k0, sx, sy, sz, + cudaMemcpyHostToDevice, + state_host_key); + return 0; +} + static void copy_state_device_batch(void *block_tag, int state_count, double *device_buffer, const int *ex, int i0, int j0, int k0, int sx, int sy, int sz, - int pack_not_unpack) + int pack_not_unpack, + double **state_host_key = nullptr) { if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; if (sx <= 0 || sy <= 0 || sz <= 0) return; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, state_host_key, all, + pack_not_unpack == 0); + double *base_mem = ctx.d_resident_mem[bank]; const int region_all = sx * sy * sz; dim3 launch_grid((unsigned int)grid((size_t)region_all), (unsigned int)state_count); if (pack_not_unpack) { kern_pack_state_region_batch<<>>( - ctx.d_state_curr_mem, device_buffer, + base_mem, device_buffer, ex[0], ex[1], i0, j0, k0, sx, sy, sz, region_all, state_count, ex[0] * ex[1] * ex[2]); } else { kern_unpack_state_region_batch<<>>( - ctx.d_state_curr_mem, device_buffer, + base_mem, device_buffer, ex[0], ex[1], i0, j0, k0, sx, sy, sz, region_all, state_count, ex[0] * ex[1] * ex[2]); + ctx.resident_valid[bank] = true; + ctx.resident_age[bank] = ++ctx.resident_clock; + mark_resident_current_bank(ctx, bank); + update_state_ready(ctx); } } @@ -6286,7 +6714,8 @@ static void copy_state_device_segments(void *block_tag, const int *ex, int segment_count, const int *segment_meta, - int pack_not_unpack) + int pack_not_unpack, + double **state_host_key = nullptr) { if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; if (segment_count <= 0 || !segment_meta) return; @@ -6300,6 +6729,10 @@ static void copy_state_device_segments(void *block_tag, if (max_region_all <= 0) return; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, state_host_key, all, + pack_not_unpack == 0); + double *base_mem = ctx.d_resident_mem[bank]; int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8); CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, (size_t)segment_count * 8 * sizeof(int), @@ -6310,14 +6743,18 @@ static void copy_state_device_segments(void *block_tag, (unsigned int)segment_count); if (pack_not_unpack) { kern_pack_state_segments_batch<<>>( - ctx.d_state_curr_mem, device_buffer, + base_mem, device_buffer, ex[0], ex[1], d_meta, state_count, ex[0] * ex[1] * ex[2]); } else { kern_unpack_state_segments_batch<<>>( - ctx.d_state_curr_mem, device_buffer, + base_mem, device_buffer, ex[0], ex[1], d_meta, state_count, ex[0] * ex[1] * ex[2]); + ctx.resident_valid[bank] = true; + ctx.resident_age[bank] = ++ctx.resident_clock; + mark_resident_current_bank(ctx, bank); + update_state_ready(ctx); } } @@ -6336,6 +6773,22 @@ int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_device_batch(block_tag, state_count, device_buffer, ex, + i0, j0, k0, sx, sy, sz, 1, state_host_key); + return 0; +} + extern "C" int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag, int state_count, @@ -6351,6 +6804,22 @@ int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_device_batch(block_tag, state_count, device_buffer, ex, + i0, j0, k0, sx, sy, sz, 0, state_host_key); + return 0; +} + extern "C" int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag, int state_count, @@ -6366,6 +6835,22 @@ int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, 1, state_host_key); + return 0; +} + extern "C" int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag, int state_count, @@ -6381,6 +6866,22 @@ int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, 0, state_host_key); + return 0; +} + extern "C" int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag, int state_count, @@ -6405,6 +6906,33 @@ int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, state_host_key, all, true); + const int region_all = sx * sy * sz; + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_restrict_state_region_batch<<>>( + ctx.d_resident_mem[bank], device_buffer, + ex[0], ex[1], sx, sy, sz, + fi0, fj0, fk0, region_all, state_count, + ex[0] * ex[1] * ex[2]); + return 0; +} + extern "C" int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag, int state_count, @@ -6431,6 +6959,35 @@ int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, state_host_key, all, true); + const int region_all = sx * sy * sz; + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_prolong_state_region_batch<<>>( + ctx.d_resident_mem[bank], device_buffer, + ex[0], ex[1], sx, sy, sz, + ii0, jj0, kk0, lbc_i, lbc_j, lbc_k, + region_all, state_count, + ex[0] * ex[1] * ex[2]); + return 0; +} + extern "C" int bssn_cuda_download_state_subset(void *block_tag, int *ex, @@ -6459,6 +7016,71 @@ int bssn_cuda_upload_state_subset(void *block_tag, return 0; } +extern "C" +int bssn_cuda_prepare_inter_time_level(void *block_tag, + int *ex, + double **src1_host_key, + double **src2_host_key, + double **src3_host_key, + double **dst_host_key, + int source_count, + int tindex) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (source_count != 2 && source_count != 3) return 1; + if (!resident_key_usable(src1_host_key) || + !resident_key_usable(src2_host_key) || + !resident_key_usable(dst_host_key)) + return 1; + if (source_count == 3 && !resident_key_usable(src3_host_key)) + return 1; + + double c1 = 0.0, c2 = 0.0, c3 = 0.0; + if (source_count == 2) { + if (tindex == 0) { + c1 = 0.5; c2 = 0.5; + } else if (tindex == 1) { + c1 = 0.75; c2 = 0.25; + } else if (tindex == -1) { + c1 = 0.25; c2 = 0.75; + } else { + return 1; + } + } else { + if (tindex == 0) { + c1 = 3.0 / 8.0; c2 = 3.0 / 4.0; c3 = -1.0 / 8.0; + } else if (tindex == 1 || tindex == -1) { + c1 = 5.0 / 32.0; c2 = 15.0 / 16.0; c3 = -3.0 / 32.0; + } else { + return 1; + } + } + + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + StepContext &ctx = ensure_step_ctx(block_tag, all); + const int src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true); + const int src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank); + const int src3_bank = (source_count == 3) + ? ensure_resident_bank(ctx, src3_host_key, all, true, src1_bank) + : -1; + const int dst_bank = reserve_resident_output_bank_avoiding(ctx, dst_host_key, all, + src1_bank, src2_bank, src3_bank); + + dim3 launch_grid((unsigned int)grid(all), (unsigned int)BSSN_STATE_COUNT); + kern_prepare_inter_time_level<<>>( + ctx.d_resident_mem[src1_bank], + ctx.d_resident_mem[src2_bank], + (source_count == 3) ? ctx.d_resident_mem[src3_bank] : nullptr, + ctx.d_resident_mem[dst_bank], + c1, c2, c3, BSSN_STATE_COUNT, (int)all); + ctx.resident_valid[dst_bank] = true; + ctx.resident_age[dst_bank] = ++ctx.resident_clock; + mark_resident_current_bank(ctx, dst_bank); + update_state_ready(ctx); + return 0; +} + extern "C" int bssn_cuda_has_resident_state(void *block_tag) { diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 5df93a9..fda21d6 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -97,6 +97,14 @@ int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag, int state_count, double *host_buffer, @@ -104,6 +112,14 @@ int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -111,6 +127,14 @@ int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int bssn_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -118,6 +142,14 @@ int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -125,6 +157,14 @@ int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag, int segment_count, const int *segment_meta); +int bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta); + int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -132,6 +172,14 @@ int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag, int segment_count, const int *segment_meta); +int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta); + int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -139,6 +187,14 @@ int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag, int sx, int sy, int sz, int fi0, int fj0, int fk0); +int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0); + int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -147,6 +203,15 @@ int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag, int ii0, int jj0, int kk0, int lbc_i, int lbc_j, int lbc_k); +int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k); + int bssn_cuda_download_state_subset(void *block_tag, int *ex, int subset_count, @@ -159,6 +224,15 @@ int bssn_cuda_upload_state_subset(void *block_tag, const int *state_indices, double **state_host_in); +int bssn_cuda_prepare_inter_time_level(void *block_tag, + int *ex, + double **src1_host_key, + double **src2_host_key, + double **src3_host_key, + double **dst_host_key, + int source_count, + int tindex); + int bssn_cuda_has_resident_state(void *block_tag); void bssn_cuda_release_step_ctx(void *block_tag);