From a0af9b8804c743e4aa92dc21b02f9b9042e059d8 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Wed, 8 Apr 2026 20:16:25 +0800 Subject: [PATCH] Trim GPU main-path transfer overhead --- AMSS_NCKU_source/bssn_cuda_ops.cu | 26 ++++++--- AMSS_NCKU_source/bssn_gpu.cu | 96 ++++++++++++++++++------------- 2 files changed, 74 insertions(+), 48 deletions(-) diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index 0f6b8f4..aaccf8f 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -493,9 +493,16 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, dim3 block(256); dim3 grid(div_up(n, static_cast(block.x))); + const bool need_bam_boundary = (lev == 0); + const bool need_coord_copy = need_bam_boundary; + const bool need_boundary_input = need_bam_boundary && (rk_stage != 0); + const bool need_stage_input = (rk_stage != 0); + const bool need_rhs_output = (rk_stage != 3); + bool ok = true; - if (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z || - cache.nx != nx || cache.ny != ny || cache.nz != nz) + if (need_coord_copy && + (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z || + cache.nx != nx || cache.ny != ny || cache.nz != nz)) { ok = copy_to_device(cache.X, X, bytes_x) && copy_to_device(cache.Y, Y, bytes_y) && @@ -512,10 +519,13 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, } ok = ok && - copy_to_device(cache.state0, state0, bytes) && - copy_to_device(cache.boundary, boundary_src, bytes) && - copy_to_device(cache.stage, stage_data, bytes) && - copy_to_device(cache.rhs, rhs_accum, bytes); + copy_to_device(cache.state0, state0, bytes) && + (!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) && + (!need_stage_input || copy_to_device(cache.stage, stage_data, bytes)) && + copy_to_device(cache.rhs, rhs_accum, bytes); + + if (ok && !need_stage_input) + ok = ensure_capacity(cache.stage, bytes); if (!ok) return 1; @@ -534,7 +544,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, double soa1 = SoA[1]; double soa2 = SoA[2]; - if (lev == 0) + if (need_bam_boundary) { int imin = 1; int jmin = 1; @@ -581,7 +591,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err); ok = err == cudaSuccess; - if (ok) + if (ok && need_rhs_output) { err = cudaMemcpy(rhs_accum, cache.rhs.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err); diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index b72d8ee..1db2bfe 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -134,6 +134,7 @@ struct GpuRhsCache const double *last_x = nullptr; const double *last_y = nullptr; const double *last_z = nullptr; + bool meta_uploaded = false; }; GpuRhsCache &gpu_rhs_cache() @@ -231,9 +232,31 @@ bool register_gpu_rhs_cleanup() return true; } +void ensure_gpu_rhs_invariant_symbols() +{ + static bool initialized = false; + if (initialized) + return; + + double F1o3h = 1.0 / 3.0; + double F2o3h = 2.0 / 3.0; + double F1o6h = 1.0 / 6.0; + double PIh = M_PI; + int step = GRID_DIM * BLOCK_DIM; + + cudaMemcpyToSymbol(F1o3, &F1o3h, sizeof(double)); + cudaMemcpyToSymbol(F2o3, &F2o3h, sizeof(double)); + cudaMemcpyToSymbol(F1o6, &F1o6h, sizeof(double)); + cudaMemcpyToSymbol(PI, &PIh, sizeof(double)); + cudaMemcpyToSymbol(STEP_SIZE, &step, sizeof(int)); + + initialized = true; +} + bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex) { register_gpu_rhs_cleanup(); + ensure_gpu_rhs_invariant_symbols(); const bool shape_changed = !cache.allocated || @@ -261,6 +284,7 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex) cache.last_x = nullptr; cache.last_y = nullptr; cache.last_z = nullptr; + cache.meta_uploaded = false; Meta *meta = &cache.meta; const int matrix_size = cache.matrix_size; @@ -446,7 +470,24 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex) return false; } + cudaMemcpyToSymbol(metac, meta, sizeof(Meta)); + + int _1d_size[4]; + int _2d_size[4]; + int _3d_size[4]; + for (int i = 0; i < 4; ++i) + { + _1d_size[i] = ex[0] + i; + _2d_size[i] = _1d_size[i] * (ex[1] + i); + _3d_size[i] = _2d_size[i] * (ex[2] + i); + } + cudaMemcpyToSymbol(ex_c, ex, 3 * sizeof(int)); + cudaMemcpyToSymbol(_1D_SIZE, _1d_size, 4 * sizeof(int)); + cudaMemcpyToSymbol(_2D_SIZE, _2d_size, 4 * sizeof(int)); + cudaMemcpyToSymbol(_3D_SIZE, _3d_size, 4 * sizeof(int)); + cache.allocated = true; + cache.meta_uploaded = true; return true; } @@ -2989,46 +3030,21 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, #endif//if (GAUGE == 6 || GAUGE == 7) -//3.1-----for compute_rhs_bssn--------- - //cout<<"Size of Meta:"<