From 35b6ceff023fb0753423b0cabd8d1fe694cb12d6 Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Fri, 1 May 2026 18:03:04 +0800
Subject: [PATCH] Broaden cached CUDA sync paths

---
Reviewer notes (free text placed between the three-dash line and the
diffstat; it is commentary only and is not part of the diff below).

What the diff does, as far as the hunks show:

* Parallel.C: under "#if USE_CUDA_Z4C && (ABEtype == 2)",
  cuda_direct_pack_segment_to_device and
  cuda_direct_unpack_segment_from_device gain an early branch that is taken
  when state_count == Z4C_CUDA_STATE_COUNT.  The branch computes the segment
  start (i0, j0, k0) with cuda_seg_begin, calls the new
  z4c_cuda_*_state_batch_*_device_buffer entry point, adds the elapsed
  MPI_Wtime to direct_pack_sec / direct_unpack_sec when
  sync_profile_enabled() is true, and returns whether the call returned 0.
  The pack branch returns false when type != 1.
* Parallel.C: cuda_device_state_count_supported now returns true for
  Z4C_CUDA_STATE_COUNT, while cuda_data_packer_device_batched returns -1 for
  that same count immediately after the "supported" check, so the batched
  packer still rejects the Z4c count.
* Z4c_class.C, bssnEM_class.C, bssnEScalar_class.C: in Step(), both
  Parallel::Sync calls (SynchList_pre and SynchList_cor) are replaced by
  Parallel::Sync_cached with sync_cache_pre[lev] / sync_cache_cor[lev].
* z4c_rhs_cuda.cu / z4c_rhs_cuda.h: a new static helper,
  copy_state_region_packed_batch_device_cuda, launches
  kern_pack_state_region_batch or kern_unpack_state_region_batch on
  ctx.d_state_curr_mem and the caller's device buffer; the unpack path sets
  ctx.state_ready = true.  Two extern "C" wrappers call cudaSetDevice and
  then the helper with pack_not_unpack = 1 and 0 respectively.

Points to confirm before merging:

1. NOTE(review): both new extern "C" wrappers return 0 unconditionally, and
   the helper is void and returns early when state_count is outside
   (0, BSSN_STATE_COUNT], when device_buffer is null, or when any of
   sx/sy/sz is <= 0.  The Parallel.C callers therefore compute ok == true
   even when nothing was copied.  Consider letting the helper report
   failure so the callers can return false.
2. NOTE(review): the helper bounds state_count by BSSN_STATE_COUNT, but the
   callers gate on Z4C_CUDA_STATE_COUNT.  Neither definition is visible in
   this patch; confirm Z4C_CUDA_STATE_COUNT <= BSSN_STATE_COUNT in
   z4c_rhs_cuda.cu, otherwise point 1 makes the new path a silent no-op.
3. NOTE(review): no launch-error query (for example cudaGetLastError) is
   visible after the two kernel launches in the new helper.
4. NOTE(review): in this copy the kernel launches read "<<>>" and the
   Parallel.C context lines read "MyList *" with no template argument.  This
   looks like angle-bracket content lost in transit; verify against the
   original commit before applying.  Leading whitespace in this copy was
   also collapsed, so context-line indentation below is a best-effort
   reconstruction.
5. NOTE(review): sync_cache_pre / sync_cache_cor are used in three classes
   but not declared anywhere in this diff; presumably they already exist in
   a shared base class - confirm.

 AMSS_NCKU_source/Parallel.C          | 42 +++++++++++++++++++
 AMSS_NCKU_source/Z4c_class.C         |  4 +-
 AMSS_NCKU_source/bssnEM_class.C      |  4 +-
 AMSS_NCKU_source/bssnEScalar_class.C |  4 +-
 AMSS_NCKU_source/z4c_rhs_cuda.cu     | 60 ++++++++++++++++++++++++++++
 AMSS_NCKU_source/z4c_rhs_cuda.h      | 14 +++++++
 6 files changed, 122 insertions(+), 6 deletions(-)

diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index 3e941ff..3174cc2 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -608,6 +608,24 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
                                         MyList *VarLists,
                                         int Symmetry)
 {
+#if USE_CUDA_Z4C && (ABEtype == 2)
+  if (state_count == Z4C_CUDA_STATE_COUNT)
+  {
+    if (type != 1)
+      return false;
+    const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
+    const int i0 = cuda_seg_begin(dst, src->Bg, 0);
+    const int j0 = cuda_seg_begin(dst, src->Bg, 1);
+    const int k0 = cuda_seg_begin(dst, src->Bg, 2);
+    const bool ok = z4c_cuda_pack_state_batch_to_device_buffer(
+                        src->Bg, state_count, buffer, src->Bg->shape,
+                        i0, j0, k0,
+                        dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+    if (sync_profile_enabled())
+      sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
+    return ok;
+  }
+#endif
 #if USE_CUDA_BSSN
   if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
     return false;
@@ -777,6 +795,22 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
                                             int state_count,
                                             MyList *VarListd)
 {
+#if USE_CUDA_Z4C && (ABEtype == 2)
+  if (state_count == Z4C_CUDA_STATE_COUNT)
+  {
+    const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
+    const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
+    const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
+    const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
+    const bool ok = z4c_cuda_unpack_state_batch_from_device_buffer(
+                        dst->Bg, state_count, buffer, dst->Bg->shape,
+                        i0, j0, k0,
+                        dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+    if (sync_profile_enabled())
+      sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
+    return ok;
+  }
+#endif
 #if USE_CUDA_BSSN
   if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
     return false;
@@ -856,6 +890,10 @@ bool cuda_unpack_host_region_to_resident(Block *block,
 
 bool cuda_device_state_count_supported(int state_count)
 {
+#if USE_CUDA_Z4C && (ABEtype == 2)
+  if (state_count == Z4C_CUDA_STATE_COUNT)
+    return true;
+#endif
 #if USE_CUDA_BSSN
   return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT;
 #else
@@ -933,6 +971,10 @@ int cuda_data_packer_device_batched(double *data,
   const int state_count = cuda_state_var_count(VarLists, VarListd);
   if (!cuda_device_state_count_supported(state_count))
     return -1;
+#if USE_CUDA_Z4C && (ABEtype == 2)
+  if (state_count == Z4C_CUDA_STATE_COUNT)
+    return -1;
+#endif
 
   int size_out = 0;
   Block *batch_block = 0;
diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C
index 741919f..41afb8a 100644
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -537,7 +537,7 @@ void Z4c_class::Step(int lev, int YN)
       MPI_Abort(MPI_COMM_WORLD, 1);
     }
 
-  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
+  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
 
   if (BH_num > 0 && lev == GH->levels - 1)
   {
@@ -635,7 +635,7 @@ void Z4c_class::Step(int lev, int YN)
       MPI_Abort(MPI_COMM_WORLD, 1);
     }
 
-  Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
+  Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
 
   if (BH_num > 0 && lev == GH->levels - 1)
   {
diff --git a/AMSS_NCKU_source/bssnEM_class.C b/AMSS_NCKU_source/bssnEM_class.C
index e06b701..0a9e199 100644
--- a/AMSS_NCKU_source/bssnEM_class.C
+++ b/AMSS_NCKU_source/bssnEM_class.C
@@ -1221,7 +1221,7 @@ void bssnEM_class::Step(int lev, int YN)
   }
 #endif
 
-  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
+  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
 
 #ifdef WithShell
   if (lev == 0)
@@ -1683,7 +1683,7 @@ void bssnEM_class::Step(int lev, int YN)
   }
 #endif
 
-  Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
+  Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
 
 #ifdef WithShell
   if (lev == 0)
diff --git a/AMSS_NCKU_source/bssnEScalar_class.C b/AMSS_NCKU_source/bssnEScalar_class.C
index c1e71cd..48c70bc 100644
--- a/AMSS_NCKU_source/bssnEScalar_class.C
+++ b/AMSS_NCKU_source/bssnEScalar_class.C
@@ -993,7 +993,7 @@ void bssnEScalar_class::Step(int lev, int YN)
   }
 #endif
 
-  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
+  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
 
 #ifdef WithShell
   if (lev == 0)
@@ -1349,7 +1349,7 @@ void bssnEScalar_class::Step(int lev, int YN)
   }
 #endif
 
-  Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
+  Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
 
 #ifdef WithShell
   if (lev == 0)
diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu
index 79a11c1..8addc3e 100644
--- a/AMSS_NCKU_source/z4c_rhs_cuda.cu
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu
@@ -5224,6 +5224,36 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
   }
 }
 
+static void copy_state_region_packed_batch_device_cuda(void *block_tag,
+                                                       int state_count,
+                                                       double *device_buffer,
+                                                       const int *ex,
+                                                       int i0, int j0, int k0,
+                                                       int sx, int sy, int sz,
+                                                       int pack_not_unpack)
+{
+  if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
+  if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return;
+
+  StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+  const int region_all = sx * sy * sz;
+  dim3 launch_grid((unsigned int)grid((size_t)region_all),
+                   (unsigned int)state_count);
+
+  if (pack_not_unpack) {
+    kern_pack_state_region_batch<<>>(
+        ctx.d_state_curr_mem, device_buffer, ex[0], ex[1],
+        i0, j0, k0, sx, sy, sz, region_all, state_count,
+        ex[0] * ex[1] * ex[2]);
+  } else {
+    kern_unpack_state_region_batch<<>>(
+        ctx.d_state_curr_mem, device_buffer, ex[0], ex[1],
+        i0, j0, k0, sx, sy, sz, region_all, state_count,
+        ex[0] * ex[1] * ex[2]);
+    ctx.state_ready = true;
+  }
+}
+
 static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
 {
   const size_t all = (size_t)ex[0] * ex[1] * ex[2];
@@ -7451,6 +7481,36 @@ extern "C" int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
   return 0;
 }
 
+extern "C" int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
+                                                          int state_count,
+                                                          double *device_buffer,
+                                                          int *ex,
+                                                          int i0, int j0, int k0,
+                                                          int sx, int sy, int sz)
+{
+  using namespace z4c_cuda;
+  init_gpu_dispatch();
+  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+  copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
+                                             i0, j0, k0, sx, sy, sz, 1);
+  return 0;
+}
+
+extern "C" int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
+                                                              int state_count,
+                                                              double *device_buffer,
+                                                              int *ex,
+                                                              int i0, int j0, int k0,
+                                                              int sx, int sy, int sz)
+{
+  using namespace z4c_cuda;
+  init_gpu_dispatch();
+  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+  copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
+                                             i0, j0, k0, sx, sy, sz, 0);
+  return 0;
+}
+
 extern "C" int z4c_cuda_download_state_subset(void *block_tag,
                                               int *ex,
                                               int subset_count,
diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.h b/AMSS_NCKU_source/z4c_rhs_cuda.h
index a1be80b..212965a 100644
--- a/AMSS_NCKU_source/z4c_rhs_cuda.h
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.h
@@ -60,6 +60,20 @@ int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                  int i0, int j0, int k0,
                                                  int sx, int sy, int sz);
 
+int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
+                                               int state_count,
+                                               double *device_buffer,
+                                               int *ex,
+                                               int i0, int j0, int k0,
+                                               int sx, int sy, int sz);
+
+int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
+                                                   int state_count,
+                                                   double *device_buffer,
+                                                   int *ex,
+                                                   int i0, int j0, int k0,
+                                                   int sx, int sy, int sz);
+
 int z4c_cuda_download_state_subset(void *block_tag,
                                    int *ex,
                                    int subset_count,