Persist GPU RK4 stage caches

2026-04-08 20:59:15 +08:00
parent a0af9b8804
commit 4e3946a4f0
1 changed files with 22 additions and 12 deletions
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -3,6 +3,7 @@
 #include <cmath>
 #include <cstdio>
 #include <cuda_runtime.h>
+#include <unordered_map>

 namespace {

@@ -469,18 +470,22 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               int lev,
                               int rk_stage)
 {
-  struct Rk4BoundaryCache
+  struct Rk4VarCache
  {
    CachedBuffer X, Y, Z;
    CachedBuffer state0, boundary, stage, rhs;
    const double *host_X = nullptr;
    const double *host_Y = nullptr;
    const double *host_Z = nullptr;
+    const double *host_state0 = nullptr;
+    double *host_rhs = nullptr;
    int nx = 0;
    int ny = 0;
    int nz = 0;
+    bool rhs_resident = false;
  };
-  static thread_local Rk4BoundaryCache cache;
+  static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map;
+  Rk4VarCache &cache = cache_map[state0];

  int nx = ex[0];
  int ny = ex[1];
@@ -497,8 +502,6 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
  const bool need_coord_copy = need_bam_boundary;
  const bool need_boundary_input = need_bam_boundary && (rk_stage != 0);
  const bool need_stage_input = (rk_stage != 0);
-  const bool need_rhs_output = (rk_stage != 3);
-
  bool ok = true;
  if (need_coord_copy &&
      (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
@@ -518,11 +521,16 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
    }
  }

+  const bool refresh_state0 =
+      (rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz;
+  const bool refresh_rhs =
+      (rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
+
  ok = ok &&
-       copy_to_device(cache.state0, state0, bytes) &&
+       (!refresh_state0 || copy_to_device(cache.state0, state0, bytes)) &&
       (!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
       (!need_stage_input || copy_to_device(cache.stage, stage_data, bytes)) &&
-       copy_to_device(cache.rhs, rhs_accum, bytes);
+       (!refresh_rhs || copy_to_device(cache.rhs, rhs_accum, bytes));

  if (ok && !need_stage_input)
    ok = ensure_capacity(cache.stage, bytes);
@@ -530,6 +538,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
  if (!ok)
    return 1;

+  if (refresh_state0)
+    cache.host_state0 = state0;
+  if (refresh_rhs)
+  {
+    cache.host_rhs = rhs_accum;
+    cache.rhs_resident = true;
+  }
+
  double dX = X[1] - X[0];
  double dY = Y[1] - Y[0];
  double dZ = Z[1] - Z[0];
@@ -591,12 +607,6 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
    cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
    ok = err == cudaSuccess;
-    if (ok && need_rhs_output)
-    {
-      err = cudaMemcpy(rhs_accum, cache.rhs.ptr, bytes, cudaMemcpyDeviceToHost);
-      if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err);
-      ok = err == cudaSuccess;
-    }
  }

  return ok ? 0 : 1;