From c578a15ecde5f1468ded844d5fcec4e6b2672f18 Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Fri, 10 Apr 2026 10:29:04 +0800
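Subject: [PATCH] Fix GPU interpolation cache lifetime leaks

The CUDA RK4 and interpolation caches in bssn_cuda_ops.cu were
function-local statics with no way to free their device buffers, and
the stencil cache and the interpolation plan cache in MPatch.C gained
a new entry for every new key.

- Move Rk4VarCache and the interpolation batch/stencil cache types to
  file scope and add bssn_cuda_release_rk4_caches() and
  bssn_cuda_release_interp_caches(), which cudaFree the cached buffers.
- Keep a single stencil entry and a single interpolation plan; a key
  mismatch releases or overwrites the previous entry.
- Add patch_release_interp_plan_cache() to drop the cached plan.
- Call the two CUDA release functions from Block::~Block(), and all
  three release functions before Parallel::KillBlocks() in
  cgh::recompose_cgh() and cgh::recompose_cgh_Onelevel().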
---
 AMSS_NCKU_source/Block.C          |   3 +
 AMSS_NCKU_source/MPatch.C         |  60 +++++--
 AMSS_NCKU_source/MPatch.h         |  10 +-
 AMSS_NCKU_source/bssn_cuda_ops.cu | 252 ++++++++++++++++++++----------
 AMSS_NCKU_source/bssn_cuda_ops.h  |   2 +
 AMSS_NCKU_source/cgh.C            |  13 ++
 6 files changed, 241 insertions(+), 99 deletions(-)
diff --git a/AMSS_NCKU_source/Block.C b/AMSS_NCKU_source/Block.C
index b875b27..e921cfa 100644
--- a/AMSS_NCKU_source/Block.C
+++ b/AMSS_NCKU_source/Block.C
@@ -13,6 +13,7 @@ using namespace std;
 #include "misc.h"
 #ifdef USE_GPU
 #include "bssn_gpu.h"
+#include "bssn_cuda_ops.h"
 #endif
 
 Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
@@ -106,6 +107,8 @@ Block::~Block()
   {
 #ifdef USE_GPU
     bssn_gpu_clear_cached_device_buffers();
+    bssn_cuda_release_rk4_caches();
+    bssn_cuda_release_interp_caches();
 #endif
     for (int i = 0; i < dim; i++)
       delete[] X[i];
diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C
index 68ac732..29680d8 100644
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -79,6 +79,15 @@ struct CachedInterpPlan
   CachedInterpPlan() : nblocks(0) {}
 };
 
+struct CachedInterpPlanEntry // single cache slot: one key plus the plan stored for it
+{
+  bool valid; // set by get_cached_interp_plan when it fills the slot; cleared by release_interp_plan_cache_internal
+  InterpPlanKey key; // key the stored plan was built for
+  CachedInterpPlan plan; // the cached plan itself
+
+  CachedInterpPlanEntry() : valid(false) {} // starts empty, so the first lookup is a miss
+};
+
 struct InterpBlockView
 {
   Block *bp;
@@ -268,6 +277,23 @@ bool should_try_cuda_interp(int ordn, int num_points, int num_var)
   return num_points * num_var >= 256;
 }
 
+CachedInterpPlanEntry &interp_plan_cache_entry() // accessor for the one plan-cache slot
+{
+  static CachedInterpPlanEntry cache; // function-local static (not thread_local), constructed on first call
+  return cache;
+}
+
+bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs) // true when every key field compares equal
+{
+  return lhs.patch == rhs.patch &&
+         lhs.x == rhs.x && // x/y/z are assigned from XX[...] by get_cached_interp_plan, so presumably these compare addresses, not coordinates - TODO confirm field types
+         lhs.y == rhs.y &&
+         lhs.z == rhs.z &&
+         lhs.NN == rhs.NN &&
+         lhs.Symmetry == rhs.Symmetry &&
+         lhs.myrank == rhs.myrank;
+}
+
 CachedInterpPlan &get_cached_interp_plan(Patch *patch,
                                          int NN, double **XX,
                                          int Symmetry, int myrank,
@@ -276,8 +302,6 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch,
                                          bool report_bounds_here,
                                          bool allow_missing_points)
 {
-  static map<InterpPlanKey, CachedInterpPlan, InterpPlanKeyLess> cache;
-
   InterpPlanKey key;
   key.patch = patch;
   key.x = XX[0];
@@ -287,12 +311,16 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch,
   key.Symmetry = Symmetry;
   key.myrank = myrank;
 
-  map<InterpPlanKey, CachedInterpPlan, InterpPlanKeyLess>::iterator it = cache.find(key);
-  if (it != cache.end() && it->second.nblocks == static_cast<int>(block_index.views.size()))
-    return it->second;
+  CachedInterpPlanEntry &cache = interp_plan_cache_entry();
+  if (cache.valid &&
+      same_interp_plan_key(cache.key, key) &&
+      cache.plan.nblocks == static_cast<int>(block_index.views.size()))
+    return cache.plan;
 
-  CachedInterpPlan &plan = cache[key];
-  plan = CachedInterpPlan();
+  cache.valid = true;
+  cache.key = key;
+  cache.plan = CachedInterpPlan();
+  CachedInterpPlan &plan = cache.plan;
   plan.nblocks = static_cast<int>(block_index.views.size());
   plan.owner_rank.assign(NN, -1);
   plan.owner_block.assign(NN, -1);
@@ -380,6 +408,13 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch,
   return plan;
 }
 
+void release_interp_plan_cache_internal() // empties the single-entry interpolation plan cache
+{
+  CachedInterpPlanEntry &cache = interp_plan_cache_entry();
+  cache.valid = false; // get_cached_interp_plan checks this flag first, so its next call rebuilds the plan
+  cache.plan = CachedInterpPlan(); // replace the stored plan with a default-constructed one
+}
+
 bool run_cuda_interp_for_block(Block *BP,
                                const vector<InterpVarDesc> &vars,
                                const vector<int> &point_ids,
@@ -487,9 +522,14 @@ void interpolate_owned_points(MyList<var> *VarList,
   }
 }
 } // namespace
-
-Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
-{
+
+void patch_release_interp_plan_cache() // externally visible entry point (declared in MPatch.h) for dropping the cached plan
+{
+  release_interp_plan_cache_internal(); // the helper itself sits inside the namespace closed just above
+}
+
+Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
+{
 
   int hbuffer_width = buffer_width;
   if (lev == 0)
diff --git a/AMSS_NCKU_source/MPatch.h b/AMSS_NCKU_source/MPatch.h
index b993be6..6f231eb 100644
--- a/AMSS_NCKU_source/MPatch.h
+++ b/AMSS_NCKU_source/MPatch.h
@@ -8,7 +8,7 @@
 #include "var.h"
 #include "macrodef.h" //need dim here; Vertex or Cell; ghost_width
 
-class Patch
+class Patch
 {
 
 public:
@@ -50,6 +50,8 @@ public:
                          double *Shellf, int Symmetry, MPI_Comm Comm_here);
    void Find_Maximum(MyList<var> *VarList, double *XX,
                      double *Shellf, MPI_Comm Comm_here);
-};
-
-#endif /* PATCH_H */
+};
+
+void patch_release_interp_plan_cache();
+
+#endif /* PATCH_H */
diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu
index be62fed..bbc2c08 100644
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -48,6 +48,30 @@ struct CachedIntBuffer
   size_t capacity = 0;
 };
 
+inline void release_buffer(CachedBuffer &buffer) // cudaFree the buffer's allocation, if any, and reset it to empty
+{
+  if (buffer.ptr) // nothing to free when the pointer is already null
+  {
+    cudaError_t free_err = cudaFree(buffer.ptr);
+    if (free_err != cudaSuccess)
+      report_cuda_error("cudaFree", free_err); // report only; the pointer is still cleared below
+    buffer.ptr = nullptr;
+  }
+  buffer.capacity = 0; // reset even when there was nothing to free
+}
+
+inline void release_buffer(CachedIntBuffer &buffer) // CachedIntBuffer overload: cudaFree the allocation, if any, and reset to empty
+{
+  if (buffer.ptr) // nothing to free when the pointer is already null
+  {
+    cudaError_t free_err = cudaFree(buffer.ptr);
+    if (free_err != cudaSuccess)
+      report_cuda_error("cudaFree", free_err); // report only; the pointer is still cleared below
+    buffer.ptr = nullptr;
+  }
+  buffer.capacity = 0; // reset even when there was nothing to free
+}
+
 inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
 {
   if (bytes <= buffer.capacity && buffer.ptr)
@@ -98,6 +122,95 @@ inline bool ensure_capacity(CachedIntBuffer &buffer, size_t bytes)
   return true;
 }
 
+struct Rk4VarCache // value type of rk4_var_cache_map(); one instance per state0 key
+{
+  CachedBuffer X, Y, Z; // freed by bssn_cuda_release_rk4_caches()
+  CachedBuffer state0, boundary, stage, rhs; // freed by bssn_cuda_release_rk4_caches()
+  const double *host_X = nullptr; // NOTE(review): host_* presumably remember which host arrays the buffers mirror - confirm in bssn_cuda_rk4_boundary_var
+  const double *host_Y = nullptr;
+  const double *host_Z = nullptr;
+  const double *host_state0 = nullptr;
+  double *host_rhs = nullptr;
+  int nx = 0;
+  int ny = 0;
+  int nz = 0;
+  bool rhs_resident = false;
+};
+
+struct InterpStencilCacheEntry // one cached interpolation stencil plus the key it was built for
+{
+  const double *X = nullptr; // X through symmetry form the key compared in bssn_cuda_interp_points_batch
+  const double *Y = nullptr;
+  const double *Z = nullptr;
+  const double *px = nullptr;
+  const double *py = nullptr;
+  const double *pz = nullptr;
+  int nx = 0;
+  int ny = 0;
+  int nz = 0;
+  int num_points = 0;
+  int ordn = 0;
+  int symmetry = 0;
+  bool valid = false; // a lookup can only match while this is true
+  CachedBuffer weights; // filled by copy_to_device from host_weights
+  CachedIntBuffer indices; // filled by copy_to_device from host_indices
+  CachedIntBuffer reflect; // filled by copy_to_device from host_reflect
+};
+
+struct InterpBatchCache // cache object returned by interp_batch_cache()
+{
+  CachedBuffer out; // read as dout in bssn_cuda_interp_points_batch
+  CachedBuffer soa; // read as dsoa in bssn_cuda_interp_points_batch
+  CachedBuffer field_ptrs; // reinterpreted there as an array of const double* (dfields)
+  CachedIntBuffer error_flag; // read as derror in bssn_cuda_interp_points_batch
+  std::vector<CachedBuffer> host_field_copies; // each element is freed individually by release_interp_batch_cache
+  InterpStencilCacheEntry stencil_entry; // single slot, released and refilled on a key mismatch
+};
+
+std::unordered_map<const double *, Rk4VarCache> &rk4_var_cache_map() // accessor for the RK4 cache map, indexed by state0 in bssn_cuda_rk4_boundary_var
+{
+  static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map; // thread_local: each thread gets its own map
+  return cache_map;
+}
+
+InterpBatchCache &interp_batch_cache() // accessor for the interpolation batch cache
+{
+  static thread_local InterpBatchCache cache; // thread_local: each thread gets its own cache object
+  return cache;
+}
+
+inline void release_interp_stencil_cache(InterpStencilCacheEntry &entry) // frees the stencil buffers and resets the key to its defaults
+{
+  release_buffer(entry.weights); // the three release_buffer calls cudaFree any allocated buffer
+  release_buffer(entry.indices);
+  release_buffer(entry.reflect);
+  entry.X = nullptr; // from here on: restore every key field to its in-class default
+  entry.Y = nullptr;
+  entry.Z = nullptr;
+  entry.px = nullptr;
+  entry.py = nullptr;
+  entry.pz = nullptr;
+  entry.nx = 0;
+  entry.ny = 0;
+  entry.nz = 0;
+  entry.num_points = 0;
+  entry.ordn = 0;
+  entry.symmetry = 0;
+  entry.valid = false; // the entry can no longer match a lookup
+}
+
+inline void release_interp_batch_cache(InterpBatchCache &cache) // frees every buffer held by the batch cache, including the stencil entry
+{
+  release_buffer(cache.out);
+  release_buffer(cache.soa);
+  release_buffer(cache.field_ptrs);
+  release_buffer(cache.error_flag);
+  for (size_t i = 0; i < cache.host_field_copies.size(); ++i)
+    release_buffer(cache.host_field_copies[i]); // free each element before the vector is emptied
+  cache.host_field_copies.clear(); // drop the now-empty CachedBuffer elements
+  release_interp_stencil_cache(cache.stencil_entry); // also invalidates the cached stencil key
+}
+
 inline bool copy_to_device(CachedIntBuffer &dst, const int *src, size_t bytes)
 {
   if (!ensure_capacity(dst, bytes))
@@ -731,22 +844,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                                int rk_stage,
                                bool download_to_host)
 {
-  struct Rk4VarCache
-  {
-    CachedBuffer X, Y, Z;
-    CachedBuffer state0, boundary, stage, rhs;
-    const double *host_X = nullptr;
-    const double *host_Y = nullptr;
-    const double *host_Z = nullptr;
-    const double *host_state0 = nullptr;
-    double *host_rhs = nullptr;
-    int nx = 0;
-    int ny = 0;
-    int nz = 0;
-    bool rhs_resident = false;
-  };
-  static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map;
-  Rk4VarCache &cache = cache_map[state0];
+  Rk4VarCache &cache = rk4_var_cache_map()[state0];
 
   int nx = ex[0];
   int ny = ex[1];
@@ -909,6 +1007,29 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
   return ok ? 0 : 1;
 }
 
+void bssn_cuda_release_rk4_caches() // frees all buffers in the calling thread's RK4 cache map and empties the map
+{
+  std::unordered_map<const double *, Rk4VarCache> &cache_map = rk4_var_cache_map(); // the map is thread_local, so only this thread's entries are touched
+  for (std::unordered_map<const double *, Rk4VarCache>::iterator it = cache_map.begin();
+       it != cache_map.end(); ++it)
+  {
+    Rk4VarCache &cache = it->second;
+    release_buffer(cache.X); // release all seven CachedBuffer members of this entry
+    release_buffer(cache.Y);
+    release_buffer(cache.Z);
+    release_buffer(cache.state0);
+    release_buffer(cache.boundary);
+    release_buffer(cache.stage);
+    release_buffer(cache.rhs);
+  }
+  cache_map.clear(); // removes the entries, so bssn_cuda_rk4_boundary_var starts from a default Rk4VarCache
+}
+
+void bssn_cuda_release_interp_caches() // frees the calling thread's interpolation batch cache (output, SoA, field and stencil buffers)
+{
+  release_interp_batch_cache(interp_batch_cache()); // interp_batch_cache() returns a thread_local object
+}
+
 int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host)
 {
   static thread_local CachedBuffer d_chi;
@@ -988,40 +1109,7 @@ int bssn_cuda_interp_points_batch(const int *ex,
   if (ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
     return 1;
 
-  struct InterpBatchCache
-  {
-    struct StencilCacheEntry
-    {
-      const double *X;
-      const double *Y;
-      const double *Z;
-      const double *px;
-      const double *py;
-      const double *pz;
-      int nx;
-      int ny;
-      int nz;
-      int num_points;
-      int ordn;
-      int symmetry;
-      CachedBuffer weights;
-      CachedIntBuffer indices;
-      CachedIntBuffer reflect;
-
-      StencilCacheEntry()
-          : X(nullptr), Y(nullptr), Z(nullptr),
-            px(nullptr), py(nullptr), pz(nullptr),
-            nx(0), ny(0), nz(0), num_points(0), ordn(0), symmetry(0) {}
-    };
-
-    CachedBuffer out;
-    CachedBuffer soa;
-    CachedBuffer field_ptrs;
-    CachedIntBuffer error_flag;
-    std::vector<CachedBuffer> host_field_copies;
-    std::vector<StencilCacheEntry> stencil_entries;
-  };
-  static thread_local InterpBatchCache cache;
+  InterpBatchCache &cache = interp_batch_cache();
 
   const int nx = ex[0];
   const int ny = ex[1];
@@ -1037,37 +1125,31 @@ int bssn_cuda_interp_points_batch(const int *ex,
   const size_t indices_bytes = point_stencil_ints * sizeof(int);
 
   bool ok = true;
-  InterpBatchCache::StencilCacheEntry *stencil_cache = nullptr;
-  for (size_t i = 0; i < cache.stencil_entries.size(); ++i)
-  {
-    InterpBatchCache::StencilCacheEntry &entry = cache.stencil_entries[i];
-    if (entry.X == X && entry.Y == Y && entry.Z == Z &&
-        entry.px == px && entry.py == py && entry.pz == pz &&
-        entry.nx == nx && entry.ny == ny && entry.nz == nz &&
-        entry.num_points == num_points && entry.ordn == ordn &&
-        entry.symmetry == symmetry)
-    {
-      stencil_cache = &entry;
-      break;
-    }
-  }
+  InterpStencilCacheEntry &stencil_cache = cache.stencil_entry;
+  const bool stencil_match =
+      stencil_cache.valid &&
+      stencil_cache.X == X && stencil_cache.Y == Y && stencil_cache.Z == Z &&
+      stencil_cache.px == px && stencil_cache.py == py && stencil_cache.pz == pz &&
+      stencil_cache.nx == nx && stencil_cache.ny == ny && stencil_cache.nz == nz &&
+      stencil_cache.num_points == num_points && stencil_cache.ordn == ordn &&
+      stencil_cache.symmetry == symmetry;
 
-  if (!stencil_cache)
+  if (!stencil_match)
   {
-    cache.stencil_entries.push_back(InterpBatchCache::StencilCacheEntry());
-    stencil_cache = &cache.stencil_entries.back();
-    stencil_cache->X = X;
-    stencil_cache->Y = Y;
-    stencil_cache->Z = Z;
-    stencil_cache->px = px;
-    stencil_cache->py = py;
-    stencil_cache->pz = pz;
-    stencil_cache->nx = nx;
-    stencil_cache->ny = ny;
-    stencil_cache->nz = nz;
-    stencil_cache->num_points = num_points;
-    stencil_cache->ordn = ordn;
-    stencil_cache->symmetry = symmetry;
+    release_interp_stencil_cache(stencil_cache);
+    stencil_cache.X = X;
+    stencil_cache.Y = Y;
+    stencil_cache.Z = Z;
+    stencil_cache.px = px;
+    stencil_cache.py = py;
+    stencil_cache.pz = pz;
+    stencil_cache.nx = nx;
+    stencil_cache.ny = ny;
+    stencil_cache.nz = nz;
+    stencil_cache.num_points = num_points;
+    stencil_cache.ordn = ordn;
+    stencil_cache.symmetry = symmetry;
+    stencil_cache.valid = true;
 
     std::vector<double> host_weights(point_stencil_doubles);
     std::vector<int> host_indices(point_stencil_ints);
@@ -1104,9 +1186,15 @@ int bssn_cuda_interp_points_batch(const int *ex,
     }
 
     ok = ok &&
-         copy_to_device(stencil_cache->weights, host_weights.data(), weights_bytes) &&
-         copy_to_device(stencil_cache->indices, host_indices.data(), indices_bytes) &&
-         copy_to_device(stencil_cache->reflect, host_reflect.data(), indices_bytes);
+         copy_to_device(stencil_cache.weights, host_weights.data(), weights_bytes) &&
+         copy_to_device(stencil_cache.indices, host_indices.data(), indices_bytes) &&
+         copy_to_device(stencil_cache.reflect, host_reflect.data(), indices_bytes);
     if (!ok)
+    {
+      // The entry was keyed and marked valid before the upload; drop it so a
+      // later call with the same key rebuilds the stencil instead of reusing
+      // incomplete device buffers.
+      release_interp_stencil_cache(stencil_cache);
       return 1;
+    }
   }
@@ -1159,9 +1241,9 @@ int bssn_cuda_interp_points_batch(const int *ex,
   int ny_local = ny;
   const double *dsoa = cache.soa.ptr;
   const double *const *dfields = reinterpret_cast<const double *const *>(cache.field_ptrs.ptr);
-  const double *dweights = stencil_cache->weights.ptr;
-  const int *dindices = stencil_cache->indices.ptr;
-  const int *dreflect = stencil_cache->reflect.ptr;
+  const double *dweights = stencil_cache.weights.ptr;
+  const int *dindices = stencil_cache.indices.ptr;
+  const int *dreflect = stencil_cache.reflect.ptr;
   double *dout = cache.out.ptr;
   int *derror = cache.error_flag.ptr;
 
diff --git a/AMSS_NCKU_source/bssn_cuda_ops.h b/AMSS_NCKU_source/bssn_cuda_ops.h
index 27237d9..7b6456a 100644
--- a/AMSS_NCKU_source/bssn_cuda_ops.h
+++ b/AMSS_NCKU_source/bssn_cuda_ops.h
@@ -24,6 +24,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
 
 int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true);
 int bssn_cuda_download_buffer(int *ex, double *host_ptr);
+void bssn_cuda_release_rk4_caches();
+void bssn_cuda_release_interp_caches();
 
 int bssn_cuda_prolong3_pack(int wei,
                             const double *llbc, const double *uubc, const int *extc, const double *func,
diff --git a/AMSS_NCKU_source/cgh.C b/AMSS_NCKU_source/cgh.C
index 3f46095..8d34e72 100644
--- a/AMSS_NCKU_source/cgh.C
+++ b/AMSS_NCKU_source/cgh.C
@@ -29,6 +29,7 @@ using namespace std;
 #include "parameters.h"
 #ifdef USE_GPU
 #include "bssn_gpu.h"
+#include "bssn_cuda_ops.h"
 #endif
 
 //================================================================================================
@@ -891,6 +892,9 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
 #ifdef USE_GPU
       bssn_gpu_clear_cached_device_buffers();
       bssn_gpu_release_pinned_host_buffers();
+      bssn_cuda_release_rk4_caches();
+      bssn_cuda_release_interp_caches();
+      patch_release_interp_plan_cache();
 #endif
       Parallel::KillBlocks(PatL[lev]);
       PatL[lev]->destroyList();
@@ -924,6 +928,9 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
 #ifdef USE_GPU
       bssn_gpu_clear_cached_device_buffers();
       bssn_gpu_release_pinned_host_buffers();
+      bssn_cuda_release_rk4_caches();
+      bssn_cuda_release_interp_caches();
+      patch_release_interp_plan_cache();
 #endif
       Parallel::KillBlocks(PatL[lev]);
       PatL[lev]->destroyList();
@@ -1536,6 +1543,9 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
 #ifdef USE_GPU
   bssn_gpu_clear_cached_device_buffers();
   bssn_gpu_release_pinned_host_buffers();
+  bssn_cuda_release_rk4_caches();
+  bssn_cuda_release_interp_caches();
+  patch_release_interp_plan_cache();
 #endif
   Parallel::KillBlocks(PatL[lev]);
   PatL[lev]->destroyList();
@@ -1563,6 +1573,9 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
 #ifdef USE_GPU
   bssn_gpu_clear_cached_device_buffers();
   bssn_gpu_release_pinned_host_buffers();
+  bssn_cuda_release_rk4_caches();
+  bssn_cuda_release_interp_caches();
+  patch_release_interp_plan_cache();
 #endif
   Parallel::KillBlocks(PatL[lev]);
   PatL[lev]->destroyList();