From 531b31e8dbb8e1ec352ecb9cdee3736d69d759c0 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Fri, 1 May 2026 20:04:04 +0800 Subject: [PATCH] Stabilize cached Z4C CUDA sync after regrid --- AMSS_NCKU_source/Parallel.C | 255 ++++++++++---- AMSS_NCKU_source/Z4c_class.C | 39 ++- AMSS_NCKU_source/bssn_class.C | 18 +- AMSS_NCKU_source/z4c_rhs_cuda.cu | 553 +++++++++++++++++++++++++++++++ AMSS_NCKU_source/z4c_rhs_cuda.h | 47 +++ 5 files changed, 833 insertions(+), 79 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 3174cc2..7dc3a34 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -16,7 +16,7 @@ #if USE_CUDA_BSSN || USE_CUDA_Z4C #include #endif -#if USE_CUDA_BSSN +#if USE_CUDA_BSSN || USE_CUDA_Z4C #include "bssn_rhs_cuda.h" #endif #if USE_CUDA_Z4C @@ -191,26 +191,26 @@ bool cuda_build_bssn_host_views(Block *block, } return v == 0; } +#endif -bool cuda_build_bssn_soa(MyList *vars, - int state_count, - double *soa_flat) +bool cuda_build_state_soa(MyList *vars, + int state_count, + double *soa_flat) { - if (!vars || !soa_flat || state_count != BSSN_CUDA_STATE_COUNT) - return false; - MyList *v = vars; - for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i) - { - if (!v) - return false; + if (!vars || !soa_flat || state_count <= 0) + return false; + MyList *v = vars; + for (int i = 0; i < state_count; ++i) + { + if (!v) + return false; soa_flat[3 * i + 0] = v->data->SoA[0]; soa_flat[3 * i + 1] = v->data->SoA[1]; soa_flat[3 * i + 2] = v->data->SoA[2]; v = v->next; - } - return v == 0; + } + return v == 0; } -#endif #if USE_CUDA_BSSN || USE_CUDA_Z4C int fortran_idint(double x) @@ -231,7 +231,7 @@ bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src, const Parallel::gridseg *dst, int first_fine[3]) { -#if USE_CUDA_BSSN && defined(Cell) && (ghost_width == 3) +#if (USE_CUDA_BSSN || USE_CUDA_Z4C) && defined(Cell) && (ghost_width == 3) if (!src || !dst || !src->Bg || !dst->Bg) return false; for (int d = 0; d < dim; ++d) @@ -272,7 +272,7 @@ bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src, int first_fine_ii[3], int coarse_lb[3]) { -#if USE_CUDA_BSSN && defined(Cell) && (ghost_width == 3) +#if (USE_CUDA_BSSN || USE_CUDA_Z4C) && defined(Cell) && (ghost_width == 3) if (!src || !dst || !src->Bg || !dst->Bg) return false; for (int d = 0; d < dim; ++d) @@ -330,7 +330,30 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg return false; #if USE_CUDA_Z4C && (ABEtype == 2) if (type != 1) + { + static int z4c_amr_device = -1; + if (z4c_amr_device < 0) + { + const char *env = getenv("AMSS_CUDA_Z4C_AMR_DEVICE"); + z4c_amr_device = (env && atoi(env) != 0) ? 1 : 0; + } + if (!z4c_amr_device) + return false; + } + if (type == 2 && !cuda_amr_restrict_device_enabled()) return false; + if (type == 3 && !cuda_amr_prolong_device_enabled()) + return false; + if (type == 2) { + int a[3]; + if (!cuda_cell_gw3_restrict_params(src, dst, a)) + return false; + } + if (type == 3) { + int a[3], b[3]; + if (!cuda_cell_gw3_prolong_params(src, dst, a, b)) + return false; + } return z4c_cuda_has_resident_state(src->Bg) != 0; #elif USE_CUDA_BSSN if (bssn_cuda_has_resident_state(src->Bg) == 0) @@ -362,8 +385,6 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type) if (type < 1 || type > 3 || !dst || !dst->Bg) return false; #if USE_CUDA_Z4C && (ABEtype == 2) - if (type != 1) - return false; return z4c_cuda_has_resident_state(dst->Bg) != 0; #elif USE_CUDA_BSSN return bssn_cuda_has_resident_state(dst->Bg) != 0; @@ -611,16 +632,43 @@ bool cuda_direct_pack_segment_to_device(double *buffer, #if USE_CUDA_Z4C && (ABEtype == 2) if (state_count == Z4C_CUDA_STATE_COUNT) { - if (type != 1) - return false; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; - const int i0 = cuda_seg_begin(dst, src->Bg, 0); - const int j0 = cuda_seg_begin(dst, src->Bg, 1); - const int k0 = cuda_seg_begin(dst, src->Bg, 2); - const bool ok = z4c_cuda_pack_state_batch_to_device_buffer( - src->Bg, state_count, buffer, src->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + bool ok = false; + double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; + const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat); + if (type == 1) + { + const int i0 = cuda_seg_begin(dst, src->Bg, 0); + const int j0 = cuda_seg_begin(dst, src->Bg, 1); + const int k0 = cuda_seg_begin(dst, src->Bg, 2); + ok = z4c_cuda_pack_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + } + else if (type == 2) + { + int first_fine[3]; + if (!cuda_cell_gw3_restrict_params(src, dst, first_fine)) + return false; + ok = z4c_cuda_restrict_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2], + have_soa ? soa_flat : 0) == 0; + } + else if (type == 3) + { + int first_fine_ii[3], coarse_lb[3]; + if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb)) + return false; + ok = z4c_cuda_prolong_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2], + have_soa ? soa_flat : 0) == 0; + } if (sync_profile_enabled()) sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; return ok; @@ -634,7 +682,7 @@ bool cuda_direct_pack_segment_to_device(double *buffer, double *views[BSSN_CUDA_STATE_COUNT]; double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views); - const bool have_soa = cuda_build_bssn_soa(VarLists, state_count, soa_flat); + const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat); if (type == 1) { const int i0 = cuda_seg_begin(dst, src->Bg, 0); @@ -842,6 +890,29 @@ bool cuda_download_resident_subset_to_host(Block *block, MyList *vars, int state_count) { +#if USE_CUDA_Z4C && (ABEtype == 2) + if (state_count == Z4C_CUDA_STATE_COUNT) + { + if (!block || !vars) + return false; + if (z4c_cuda_has_resident_state(block) == 0) + return true; + int indices[Z4C_CUDA_STATE_COUNT]; + double *views[Z4C_CUDA_STATE_COUNT]; + MyList *v = vars; + for (int i = 0; i < state_count; ++i) + { + if (!v) + return false; + indices[i] = i; + views[i] = block->fgfs[v->data->sgfn]; + if (!views[i]) + return false; + v = v->next; + } + return z4c_cuda_download_state_subset(block, block->shape, state_count, indices, views) == 0; + } +#endif #if USE_CUDA_BSSN if (!block || state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) return false; @@ -870,7 +941,23 @@ bool cuda_unpack_host_region_to_resident(Block *block, double *buffer, const Parallel::gridseg *dst) { -#if USE_CUDA_BSSN +#if USE_CUDA_Z4C && (ABEtype == 2) + if (block && dst && state_index >= 0 && state_index < Z4C_CUDA_STATE_COUNT && + z4c_cuda_has_resident_state(block) == 0) + return true; + if (block && dst && state_index >= 0 && state_index < Z4C_CUDA_STATE_COUNT && + z4c_cuda_has_resident_state(block) != 0) + { + const int i0 = cuda_seg_begin(dst, block, 0); + const int j0 = cuda_seg_begin(dst, block, 1); + const int k0 = cuda_seg_begin(dst, block, 2); + return z4c_cuda_unpack_state_region_from_host_buffer( + block, state_index, buffer, block->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + } +#endif + #if USE_CUDA_BSSN if (!block || !dst || state_index < 0 || state_index >= BSSN_CUDA_STATE_COUNT) return false; if (bssn_cuda_has_resident_state(block) == 0) @@ -902,7 +989,7 @@ bool cuda_device_state_count_supported(int state_count) #endif } -#if USE_CUDA_BSSN +#if USE_CUDA_BSSN || USE_CUDA_Z4C bool cuda_flush_device_segment_batch(Block *block, double *data, int state_count, @@ -913,12 +1000,35 @@ bool cuda_flush_device_segment_batch(Block *block, { if (!block || meta.empty()) return true; - const int stride = (dir == PACK && type == 3) ? 11 : 8; - const int segment_count = (int)(meta.size() / stride); - double *views[BSSN_CUDA_STATE_COUNT]; - double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; - const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views); - const bool have_soa = cuda_build_bssn_soa(vars, state_count, soa_flat); + const int stride = (dir == PACK && type == 3) ? 11 : 8; + const int segment_count = (int)(meta.size() / stride); +#if USE_CUDA_Z4C && (ABEtype == 2) + if (state_count == Z4C_CUDA_STATE_COUNT) + { + double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; + const bool have_soa = cuda_build_state_soa(vars, state_count, soa_flat); + if (dir == PACK) + { + if (type == 2) + return z4c_cuda_restrict_state_segments_to_device_buffer( + block, state_count, data, block->shape, segment_count, + meta.data(), have_soa ? soa_flat : 0) == 0; + if (type == 3) + return z4c_cuda_prolong_state_segments_to_device_buffer( + block, state_count, data, block->shape, segment_count, + meta.data(), have_soa ? soa_flat : 0) == 0; + return z4c_cuda_pack_state_segments_to_device_buffer( + block, state_count, data, block->shape, segment_count, meta.data()) == 0; + } + return z4c_cuda_unpack_state_segments_from_device_buffer( + block, state_count, data, block->shape, segment_count, meta.data()) == 0; + } +#endif +#if USE_CUDA_BSSN + double *views[BSSN_CUDA_STATE_COUNT]; + double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views); + const bool have_soa = cuda_build_state_soa(vars, state_count, soa_flat); if (dir == PACK) { if (type == 2) @@ -941,11 +1051,15 @@ bool cuda_flush_device_segment_batch(Block *block, : bssn_cuda_pack_state_segments_to_device_buffer( block, state_count, data, block->shape, segment_count, meta.data()) == 0; } - return have_views - ? bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views( - block, views, state_count, data, block->shape, segment_count, meta.data()) == 0 - : bssn_cuda_unpack_state_segments_from_device_buffer( - block, state_count, data, block->shape, segment_count, meta.data()) == 0; + return have_views + ? bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views( + block, views, state_count, data, block->shape, segment_count, meta.data()) == 0 + : bssn_cuda_unpack_state_segments_from_device_buffer( + block, state_count, data, block->shape, segment_count, meta.data()) == 0; +#else + (void)data; (void)state_count; (void)dir; (void)type; (void)vars; (void)segment_count; + return false; +#endif } int cuda_data_packer_device_batched(double *data, @@ -971,11 +1085,6 @@ int cuda_data_packer_device_batched(double *data, const int state_count = cuda_state_var_count(VarLists, VarListd); if (!cuda_device_state_count_supported(state_count)) return -1; -#if USE_CUDA_Z4C && (ABEtype == 2) - if (state_count == Z4C_CUDA_STATE_COUNT) - return -1; -#endif - int size_out = 0; Block *batch_block = 0; int batch_type = 0; @@ -1240,7 +1349,7 @@ int data_packer_with_device_buffer(double *data, MyList *VarListd, int Symmetry) { -#if USE_CUDA_BSSN +#if USE_CUDA_BSSN || USE_CUDA_Z4C const double batched_t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; const int batched = cuda_data_packer_device_batched(data, src, dst, rank_in, dir, VarLists, VarListd, Symmetry); @@ -5063,20 +5172,37 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data && src->data->Bg && bssn_cuda_has_resident_state(src->data->Bg)) - { - if (!cuda_download_resident_subset_to_host(src->data->Bg, VarLists, state_count)) +#if USE_CUDA_BSSN || USE_CUDA_Z4C + if (dir == PACK && state_idx == 0 && cuda_state_count_direct_supported(state_count) && + src->data && src->data->Bg) + { + if (!cuda_download_resident_subset_to_host(src->data->Bg, VarLists, state_count)) { cout << "Parallel::data_packer: CUDA resident fallback download failed." << endl; MPI_Abort(MPI_COMM_WORLD, 1); } } #endif - if (dir == PACK) - switch (type) - { + if (dir == PACK) + { + if (!src->data || !src->data->Bg || !src->data->Bg->fgfs || + !varls || !varls->data || + !src->data->Bg->fgfs[varls->data->sgfn]) + { + fprintf(stderr, + "Parallel::data_packer: null host pack source " + "state_count=%d state_idx=%d type=%d rank_in=%d " + "var=%s sgfn=%d src_data=%p src_bg=%p fgfs=%p\n", + state_count, state_idx, type, rank_in, + (varls && varls->data) ? varls->data->name : "(null)", + (varls && varls->data) ? varls->data->sgfn : -1, + (void *)src->data, + (src->data ? (void *)src->data->Bg : 0), + (src->data && src->data->Bg ? (void *)src->data->Bg->fgfs : 0)); + MPI_Abort(MPI_COMM_WORLD, 1); + } + switch (type) + { // attention must be paied to the difference between src's llb,uub and dst's llb,uub case 1: f_copy(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, @@ -5088,20 +5214,21 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); break; - case 3: - f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], - dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, - dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); - } + case 3: + f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], + dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, + dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); + } + } if (dir == UNPACK) // from target data to corresponding grid { f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn], dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, dst->data->llb, dst->data->uub); -#if USE_CUDA_BSSN - if (cuda_state_count_direct_supported(state_count) && - dst->data && dst->data->Bg && bssn_cuda_has_resident_state(dst->data->Bg)) - { +#if USE_CUDA_BSSN || USE_CUDA_Z4C + if (cuda_state_count_direct_supported(state_count) && + dst->data && dst->data->Bg) + { if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data)) { cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl; diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C index 732c556..b75ed73 100644 --- a/AMSS_NCKU_source/Z4c_class.C +++ b/AMSS_NCKU_source/Z4c_class.C @@ -1,9 +1,10 @@ -#ifdef newc -#include -#include -#include -using namespace std; +#ifdef newc +#include +#include +#include +#include +using namespace std; #else #include #include @@ -215,6 +216,29 @@ bool fill_z4c_cuda_views(Block *cg, MyList *vars, return idx == Z4C_CUDA_STATE_COUNT && vars == 0; } +bool z4c_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev) +{ + static int keep_all_levels = -1; + if (keep_all_levels < 0) + { + const char *env = getenv("AMSS_CUDA_KEEP_ALL_LEVELS"); + keep_all_levels = (env && atoi(env) != 0) ? 1 : 0; + } + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + if (!enabled) + return false; + if (lev == analysis_lev) + return false; + if (keep_all_levels) + return true; + return lev < trfls_in; +} + void z4c_cuda_download_level_state(MyList *PatL, MyList *vars, int myrank, bool release_ctx) { MyList *Pp = PatL; @@ -693,7 +717,10 @@ void Z4c_class::Step(int lev, int YN) } } - z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true); + { + const bool keep_resident = z4c_cuda_keep_resident_after_step(lev, trfls, a_lev); + z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, !keep_resident); + } #if (RPS == 0) RestrictProlong(lev, YN, BB); diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 87b235d..b6aaf67 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -3029,8 +3029,8 @@ void bssn_class::Evolve(int Steps) GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); -#if (ABEtype != 1 && ABEtype != 2) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } +#if (ABEtype != 1) + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif STEP_TIMER_ADD(TB_REGRID, timer_regrid); #endif @@ -3275,8 +3275,8 @@ void bssn_class::RecursiveStep(int lev) { if (ConstraintRefreshLevels) ConstraintRefreshLevels[lev] = 1; -#if (ABEtype != 1 && ABEtype != 2) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } + #if (ABEtype != 1) + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel); @@ -3466,7 +3466,7 @@ void bssn_class::ParallelStep() SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) { -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -3644,7 +3644,7 @@ void bssn_class::ParallelStep() SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) { -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -3670,7 +3670,7 @@ void bssn_class::ParallelStep() SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) { -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -3700,7 +3700,7 @@ void bssn_class::ParallelStep() SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) { -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -3727,7 +3727,7 @@ void bssn_class::ParallelStep() SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) { -#if (ABEtype != 1 && ABEtype != 2) +#if (ABEtype != 1) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu index d6a46d8..554d8e4 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.cu +++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu @@ -420,6 +420,7 @@ static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = { }; __constant__ int d_subset_state_indices[BSSN_STATE_COUNT]; +__constant__ double d_comm_state_soa[3 * BSSN_STATE_COUNT]; static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = { 1, 1, 1, @@ -499,6 +500,8 @@ struct StepAllocation { static std::unordered_map g_step_ctx; static std::vector g_step_pool; +static int *g_comm_segment_meta = nullptr; +static size_t g_comm_segment_meta_cap = 0; static StepAllocation empty_step_allocation() { @@ -717,6 +720,39 @@ static double *ensure_step_host_comm_buffer(StepContext &ctx, size_t needed_doub return ctx.h_comm_mem; } +static int *ensure_comm_segment_meta_buffer(size_t needed_ints) +{ + if (needed_ints == 0) return nullptr; + if (g_comm_segment_meta_cap < needed_ints) { + if (g_comm_segment_meta) { + CUDA_CHECK(cudaFree(g_comm_segment_meta)); + g_comm_segment_meta = nullptr; + } + CUDA_CHECK(cudaMalloc(&g_comm_segment_meta, needed_ints * sizeof(int))); + g_comm_segment_meta_cap = needed_ints; + } + return g_comm_segment_meta; +} + +static void upload_comm_state_soa(const double *state_soa, int state_count) +{ + double soa[3 * BSSN_STATE_COUNT]; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + soa[3 * i + 0] = 1.0; + soa[3 * i + 1] = 1.0; + soa[3 * i + 2] = 1.0; + } + if (state_soa) { + const int n = (state_count < BSSN_STATE_COUNT) ? state_count : BSSN_STATE_COUNT; + for (int i = 0; i < n; ++i) { + soa[3 * i + 0] = state_soa[3 * i + 0]; + soa[3 * i + 1] = state_soa[3 * i + 1]; + soa[3 * i + 2] = state_soa[3 * i + 2]; + } + } + CUDA_CHECK(cudaMemcpyToSymbol(d_comm_state_soa, soa, sizeof(soa))); +} + static void upload_grid_params_if_needed(const GridParams &gp) { if (!g_gp_host_cache_valid || @@ -5101,6 +5137,295 @@ __global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem, } } +__global__ void kern_pack_state_segments_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + const int * __restrict__ meta, + int state_count, + int all) +{ + const int segment = blockIdx.z; + const int state_index = blockIdx.y; + const int *m = meta + segment * 8; + const int i0 = m[0], j0 = m[1], k0 = m[2]; + const int sx = m[3], sy = m[4]; + const int region_all = m[6]; + const int offset = m[7]; + if (state_index >= state_count) return; + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny; + dst[(size_t)offset + (size_t)state_index * region_all + local] = + src_mem[(size_t)state_index * all + src]; + } +} + +__global__ void kern_unpack_state_segments_batch(double * __restrict__ dst_mem, + const double * __restrict__ src, + int nx, int ny, + const int * __restrict__ meta, + int state_count, + int all) +{ + const int segment = blockIdx.z; + const int state_index = blockIdx.y; + const int *m = meta + segment * 8; + const int i0 = m[0], j0 = m[1], k0 = m[2]; + const int sx = m[3], sy = m[4]; + const int region_all = m[6]; + const int offset = m[7]; + if (state_index >= state_count) return; + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny; + dst_mem[(size_t)state_index * all + dst] = + src[(size_t)offset + (size_t)state_index * region_all + local]; + } +} + +__device__ __forceinline__ double load_comm_state_cell_sym(const double * __restrict__ src_mem, + int state_index, + int x, int y, int z, + int nx, int ny, + int all) +{ + double s = 1.0; + if (x < 0) { + x = -x - 1; + s *= d_comm_state_soa[3 * state_index + 0]; + } + if (y < 0) { + y = -y - 1; + s *= d_comm_state_soa[3 * state_index + 1]; + } + if (z < 0) { + z = -z - 1; + s *= d_comm_state_soa[3 * state_index + 2]; + } + const int src = x + y * nx + z * nx * ny; + return s * src_mem[(size_t)state_index * all + src]; +} + +__global__ void kern_restrict_state_region_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + int region_all, + int state_count, + int all) +{ + const int state_index = blockIdx.y; + if (state_index >= state_count) return; + const double c1 = 3.0 / 256.0; + const double c2 = -25.0 / 256.0; + const double c3 = 75.0 / 128.0; + const int offs[6] = {-2, -1, 0, 1, 2, 3}; + const double w[6] = {c1, c2, c3, c3, c2, c1}; + + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int fc_i = fi0 + 2 * ii; + const int fc_j = fj0 + 2 * jj; + const int fc_k = fk0 + 2 * kk; + double sum = 0.0; + for (int oz = 0; oz < 6; ++oz) { + const int z = fc_k + offs[oz]; + const double wz = w[oz]; + for (int oy = 0; oy < 6; ++oy) { + const int y = fc_j + offs[oy]; + const double wyz = wz * w[oy]; + for (int ox = 0; ox < 6; ++ox) { + const int x = fc_i + offs[ox]; + sum += wyz * w[ox] * + load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); + } + } + } + dst[(size_t)state_index * region_all + local] = sum; + } +} + +__global__ void kern_restrict_state_segments_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + const int * __restrict__ meta, + int state_count, + int all) +{ + const int segment = blockIdx.z; + const int state_index = blockIdx.y; + const int *m = meta + segment * 8; + const int sx = m[0], sy = m[1]; + const int region_all = m[3]; + const int offset = m[4]; + const int fi0 = m[5], fj0 = m[6], fk0 = m[7]; + if (state_index >= state_count) return; + const double c1 = 3.0 / 256.0; + const double c2 = -25.0 / 256.0; + const double c3 = 75.0 / 128.0; + const int offs[6] = {-2, -1, 0, 1, 2, 3}; + const double w[6] = {c1, c2, c3, c3, c2, c1}; + + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int fc_i = fi0 + 2 * ii; + const int fc_j = fj0 + 2 * jj; + const int fc_k = fk0 + 2 * kk; + double sum = 0.0; + for (int oz = 0; oz < 6; ++oz) { + const int z = fc_k + offs[oz]; + const double wz = w[oz]; + for (int oy = 0; oy < 6; ++oy) { + const int y = fc_j + offs[oy]; + const double wyz = wz * w[oy]; + for (int ox = 0; ox < 6; ++ox) { + const int x = fc_i + offs[ox]; + sum += wyz * w[ox] * + load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); + } + } + } + dst[(size_t)offset + (size_t)state_index * region_all + local] = sum; + } +} + +__global__ void kern_prolong_state_region_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + int region_all, + int state_count, + int all) +{ + const int state_index = blockIdx.y; + if (state_index >= state_count) return; + const double c1 = 77.0 / 8192.0; + const double c2 = -693.0 / 8192.0; + const double c3 = 3465.0 / 4096.0; + const double c4 = 1155.0 / 4096.0; + const double c5 = -495.0 / 8192.0; + const double c6 = 63.0 / 8192.0; + const int offs[6] = {-2, -1, 0, 1, 2, 3}; + const double wl[6] = {c1, c2, c3, c4, c5, c6}; + const double wr[6] = {c6, c5, c4, c3, c2, c1}; + + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int fine_i = ii0 + ii; + const int fine_j = jj0 + jj; + const int fine_k = kk0 + kk; + const int ci = fine_i / 2 - lbc_i; + const int cj = fine_j / 2 - lbc_j; + const int ck = fine_k / 2 - lbc_k; + const double *wx = ((fine_i / 2) * 2 == fine_i) ? wl : wr; + const double *wy = ((fine_j / 2) * 2 == fine_j) ? wl : wr; + const double *wz = ((fine_k / 2) * 2 == fine_k) ? wl : wr; + double sum = 0.0; + for (int oz = 0; oz < 6; ++oz) { + const int z = ck + offs[oz]; + const double wzv = wz[oz]; + for (int oy = 0; oy < 6; ++oy) { + const int y = cj + offs[oy]; + const double wyz = wzv * wy[oy]; + for (int ox = 0; ox < 6; ++ox) { + const int x = ci + offs[ox]; + sum += wyz * wx[ox] * + load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); + } + } + } + dst[(size_t)state_index * region_all + local] = sum; + } +} + +__global__ void kern_prolong_state_segments_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + const int * __restrict__ meta, + int state_count, + int all) +{ + const int segment = blockIdx.z; + const int state_index = blockIdx.y; + const int *m = meta + segment * 11; + const int sx = m[0], sy = m[1]; + const int region_all = m[3]; + const int offset = m[4]; + const int ii0 = m[5], jj0 = m[6], kk0 = m[7]; + const int lbc_i = m[8], lbc_j = m[9], lbc_k = m[10]; + if (state_index >= state_count) return; + const double c1 = 77.0 / 8192.0; + const double c2 = -693.0 / 8192.0; + const double c3 = 3465.0 / 4096.0; + const double c4 = 1155.0 / 4096.0; + const double c5 = -495.0 / 8192.0; + const double c6 = 63.0 / 8192.0; + const int offs[6] = {-2, -1, 0, 1, 2, 3}; + const double wl[6] = {c1, c2, c3, c4, c5, c6}; + const double wr[6] = {c6, c5, c4, c3, c2, c1}; + + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int fine_i = ii0 + ii; + const int fine_j = jj0 + jj; + const int fine_k = kk0 + kk; + const int ci = fine_i / 2 - lbc_i; + const int cj = fine_j / 2 - lbc_j; + const int ck = fine_k / 2 - lbc_k; + const double *wx = ((fine_i / 2) * 2 == fine_i) ? wl : wr; + const double *wy = ((fine_j / 2) * 2 == fine_j) ? wl : wr; + const double *wz = ((fine_k / 2) * 2 == fine_k) ? wl : wr; + double sum = 0.0; + for (int oz = 0; oz < 6; ++oz) { + const int z = ck + offs[oz]; + const double wzv = wz[oz]; + for (int oy = 0; oy < 6; ++oy) { + const int y = cj + offs[oy]; + const double wyz = wzv * wy[oy]; + for (int ox = 0; ox < 6; ++ox) { + const int x = ci + offs[ox]; + sum += wyz * wx[ox] * + load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); + } + } + } + dst[(size_t)offset + (size_t)state_index * region_all + local] = sum; + } +} + __global__ void kern_pack_state_subset(const double * __restrict__ src_mem, double * __restrict__ dst, int subset_count, @@ -5266,6 +5591,118 @@ static void copy_state_region_packed_batch_device_cuda(void *block_tag, } } +static void copy_state_device_segments(void *block_tag, + int state_count, + double *device_buffer, + const int *ex, + int segment_count, + const int *segment_meta, + int pack_not_unpack) +{ + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (segment_count <= 0 || !segment_meta || !device_buffer) return; + + int max_region_all = 0; + for (int s = 0; s < segment_count; ++s) { + const int *m = segment_meta + s * 8; + if (m[3] <= 0 || m[4] <= 0 || m[5] <= 0 || m[6] <= 0) return; + if (m[6] > max_region_all) max_region_all = m[6]; + } + if (max_region_all <= 0) return; + + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8); + CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, + (size_t)segment_count * 8 * sizeof(int), + cudaMemcpyHostToDevice)); + + dim3 launch_grid((unsigned int)grid((size_t)max_region_all), + (unsigned int)state_count, + (unsigned int)segment_count); + if (pack_not_unpack) { + kern_pack_state_segments_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], d_meta, state_count, + ex[0] * ex[1] * ex[2]); + } else { + kern_unpack_state_segments_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], d_meta, state_count, + ex[0] * ex[1] * ex[2]); + ctx.state_ready = true; + } +} + +static void restrict_state_device_segments(void *block_tag, + int state_count, + double *device_buffer, + const int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa) +{ + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (segment_count <= 0 || !segment_meta || !device_buffer) return; + + int max_region_all = 0; + for (int s = 0; s < segment_count; ++s) { + const int *m = segment_meta + s * 8; + if (m[0] <= 0 || m[1] <= 0 || m[2] <= 0 || m[3] <= 0) return; + if (m[3] > max_region_all) max_region_all = m[3]; + } + if (max_region_all <= 0) return; + + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8); + CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, + (size_t)segment_count * 8 * sizeof(int), + cudaMemcpyHostToDevice)); + upload_comm_state_soa(state_soa, state_count); + + dim3 launch_grid((unsigned int)grid((size_t)max_region_all), + (unsigned int)state_count, + (unsigned int)segment_count); + kern_restrict_state_segments_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], d_meta, state_count, + ex[0] * ex[1] * ex[2]); +} + +static void prolong_state_device_segments(void *block_tag, + int state_count, + double *device_buffer, + const int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa) +{ + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (segment_count <= 0 || !segment_meta || !device_buffer) return; + + int max_region_all = 0; + for (int s = 0; s < segment_count; ++s) { + const int *m = segment_meta + s * 11; + if (m[0] <= 0 || m[1] <= 0 || m[2] <= 0 || m[3] <= 0) return; + if (m[3] > max_region_all) max_region_all = m[3]; + } + if (max_region_all <= 0) return; + + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 11); + CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, + (size_t)segment_count * 11 * sizeof(int), + cudaMemcpyHostToDevice)); + upload_comm_state_soa(state_soa, state_count); + + dim3 launch_grid((unsigned int)grid((size_t)max_region_all), + (unsigned int)state_count, + (unsigned int)segment_count); + kern_prolong_state_segments_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], d_meta, state_count, + ex[0] * ex[1] * ex[2]); +} + static void download_resident_state(void *block_tag, int *ex, double **state_host_out) { const size_t all = (size_t)ex[0] * ex[1] * ex[2]; @@ -7536,6 +7973,122 @@ extern "C" int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag, return 0; } +extern "C" int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, 1); + return 0; +} + +extern "C" int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, 0); + return 0; +} + +extern "C" int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + restrict_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, state_soa); + return 0; +} + +extern "C" int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + prolong_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, state_soa); + return 0; +} + +extern "C" int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *state_soa) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int region_all = sx * sy * sz; + upload_comm_state_soa(state_soa, state_count); + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_restrict_state_region_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], sx, sy, sz, + fi0, fj0, fk0, region_all, state_count, + ex[0] * ex[1] * ex[2]); + return 0; +} + +extern "C" int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *state_soa) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int region_all = sx * sy * sz; + upload_comm_state_soa(state_soa, state_count); + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_prolong_state_region_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], sx, sy, sz, + ii0, jj0, kk0, lbc_i, lbc_j, lbc_k, + region_all, state_count, + ex[0] * ex[1] * ex[2]); + return 0; +} + extern "C" int z4c_cuda_download_state_subset(void *block_tag, int *ex, int subset_count, diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.h b/AMSS_NCKU_source/z4c_rhs_cuda.h index 212965a..1d958ff 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.h +++ b/AMSS_NCKU_source/z4c_rhs_cuda.h @@ -74,6 +74,53 @@ int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta); + +int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta); + +int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa); + +int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa); + +int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *state_soa); + +int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *state_soa); + int z4c_cuda_download_state_subset(void *block_tag, int *ex, int subset_count,