From 4e3946a4f0c2e2b177b15f15d768fbaaa630a56f Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Wed, 8 Apr 2026 20:59:15 +0800 Subject: [PATCH] Persist GPU RK4 stage caches --- AMSS_NCKU_source/bssn_cuda_ops.cu | 34 ++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index aaccf8f..b4f95ae 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -3,6 +3,7 @@ #include #include #include +#include <unordered_map> namespace { @@ -469,18 +470,22 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, int lev, int rk_stage) { - struct Rk4BoundaryCache + struct Rk4VarCache { CachedBuffer X, Y, Z; CachedBuffer state0, boundary, stage, rhs; const double *host_X = nullptr; const double *host_Y = nullptr; const double *host_Z = nullptr; + const double *host_state0 = nullptr; + double *host_rhs = nullptr; int nx = 0; int ny = 0; int nz = 0; + bool rhs_resident = false; }; - static thread_local Rk4BoundaryCache cache; + static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map; + Rk4VarCache &cache = cache_map[state0]; int nx = ex[0]; int ny = ex[1]; @@ -497,8 +502,6 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, const bool need_coord_copy = need_bam_boundary; const bool need_boundary_input = need_bam_boundary && (rk_stage != 0); const bool need_stage_input = (rk_stage != 0); - const bool need_rhs_output = (rk_stage != 3); - bool ok = true; if (need_coord_copy && (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z || @@ -518,11 +521,16 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, } } + const bool refresh_state0 = + (rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz; + const bool refresh_rhs = + (rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum; + ok = ok && - copy_to_device(cache.state0, state0, bytes) && + (!refresh_state0 || copy_to_device(cache.state0, 
state0, bytes)) && (!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) && (!need_stage_input || copy_to_device(cache.stage, stage_data, bytes)) && - copy_to_device(cache.rhs, rhs_accum, bytes); + (!refresh_rhs || copy_to_device(cache.rhs, rhs_accum, bytes)); if (ok && !need_stage_input) ok = ensure_capacity(cache.stage, bytes); @@ -530,6 +538,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, if (!ok) return 1; + if (refresh_state0) + cache.host_state0 = state0; + if (refresh_rhs) + { + cache.host_rhs = rhs_accum; + cache.rhs_resident = true; + } + double dX = X[1] - X[0]; double dY = Y[1] - Y[0]; double dZ = Z[1] - Z[0]; @@ -591,12 +607,6 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err); ok = err == cudaSuccess; - if (ok && need_rhs_output) - { - err = cudaMemcpy(rhs_accum, cache.rhs.ptr, bytes, cudaMemcpyDeviceToHost); - if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err); - ok = err == cudaSuccess; - } } return ok ? 0 : 1;