From c578a15ecde5f1468ded844d5fcec4e6b2672f18 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Fri, 10 Apr 2026 10:29:04 +0800 Subject: [PATCH] Fix GPU interpolation cache lifetime leaks --- AMSS_NCKU_source/Block.C | 3 + AMSS_NCKU_source/MPatch.C | 60 +++++-- AMSS_NCKU_source/MPatch.h | 10 +- AMSS_NCKU_source/bssn_cuda_ops.cu | 252 ++++++++++++++++++++---------- AMSS_NCKU_source/bssn_cuda_ops.h | 2 + AMSS_NCKU_source/cgh.C | 13 ++ 6 files changed, 241 insertions(+), 99 deletions(-) diff --git a/AMSS_NCKU_source/Block.C b/AMSS_NCKU_source/Block.C index b875b27..e921cfa 100644 --- a/AMSS_NCKU_source/Block.C +++ b/AMSS_NCKU_source/Block.C @@ -13,6 +13,7 @@ using namespace std; #include "misc.h" #ifdef USE_GPU #include "bssn_gpu.h" +#include "bssn_cuda_ops.h" #endif Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui) @@ -106,6 +107,8 @@ Block::~Block() { #ifdef USE_GPU bssn_gpu_clear_cached_device_buffers(); + bssn_cuda_release_rk4_caches(); + bssn_cuda_release_interp_caches(); #endif for (int i = 0; i < dim; i++) delete[] X[i]; diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C index 68ac732..29680d8 100644 --- a/AMSS_NCKU_source/MPatch.C +++ b/AMSS_NCKU_source/MPatch.C @@ -79,6 +79,15 @@ struct CachedInterpPlan CachedInterpPlan() : nblocks(0) {} }; +struct CachedInterpPlanEntry +{ + bool valid; + InterpPlanKey key; + CachedInterpPlan plan; + + CachedInterpPlanEntry() : valid(false) {} +}; + struct InterpBlockView { Block *bp; @@ -268,6 +277,23 @@ bool should_try_cuda_interp(int ordn, int num_points, int num_var) return num_points * num_var >= 256; } +CachedInterpPlanEntry &interp_plan_cache_entry() +{ + static CachedInterpPlanEntry cache; + return cache; +} + +bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs) +{ + return lhs.patch == rhs.patch && + lhs.x == rhs.x && + lhs.y == rhs.y && + lhs.z == rhs.z && + lhs.NN == rhs.NN && + lhs.Symmetry == rhs.Symmetry && + lhs.myrank == rhs.myrank; +} + CachedInterpPlan &get_cached_interp_plan(Patch *patch, int NN, double **XX, int Symmetry, int myrank, @@ -276,8 +302,6 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch, bool report_bounds_here, bool allow_missing_points) { - static map cache; - InterpPlanKey key; key.patch = patch; key.x = XX[0]; @@ -287,12 +311,16 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch, key.Symmetry = Symmetry; key.myrank = myrank; - map::iterator it = cache.find(key); - if (it != cache.end() && it->second.nblocks == static_cast(block_index.views.size())) - return it->second; + CachedInterpPlanEntry &cache = interp_plan_cache_entry(); + if (cache.valid && + same_interp_plan_key(cache.key, key) && + cache.plan.nblocks == static_cast(block_index.views.size())) + return cache.plan; - CachedInterpPlan &plan = cache[key]; - plan = CachedInterpPlan(); + cache.valid = true; + cache.key = key; + cache.plan = CachedInterpPlan(); + CachedInterpPlan &plan = cache.plan; plan.nblocks = static_cast(block_index.views.size()); plan.owner_rank.assign(NN, -1); plan.owner_block.assign(NN, -1); @@ -380,6 +408,13 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch, return plan; } +void release_interp_plan_cache_internal() +{ + CachedInterpPlanEntry &cache = interp_plan_cache_entry(); + cache.valid = false; + cache.plan = CachedInterpPlan(); +} + bool run_cuda_interp_for_block(Block *BP, const vector &vars, const vector &point_ids, @@ -487,9 +522,14 @@ void interpolate_owned_points(MyList *VarList, } } } // namespace - -Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi) -{ + +void patch_release_interp_plan_cache() +{ + release_interp_plan_cache_internal(); +} + +Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi) +{ int hbuffer_width = buffer_width; if (lev == 0) diff --git a/AMSS_NCKU_source/MPatch.h b/AMSS_NCKU_source/MPatch.h index b993be6..6f231eb 100644 --- a/AMSS_NCKU_source/MPatch.h +++ b/AMSS_NCKU_source/MPatch.h @@ -8,7 +8,7 @@ #include "var.h" #include "macrodef.h" //need dim here; Vertex or Cell; ghost_width -class Patch +class Patch { public: @@ -50,6 +50,8 @@ public: double *Shellf, int Symmetry, MPI_Comm Comm_here); void Find_Maximum(MyList *VarList, double *XX, double *Shellf, MPI_Comm Comm_here); -}; - -#endif /* PATCH_H */ +}; + +void patch_release_interp_plan_cache(); + +#endif /* PATCH_H */ diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index be62fed..bbc2c08 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -48,6 +48,30 @@ struct CachedIntBuffer size_t capacity = 0; }; +inline void release_buffer(CachedBuffer &buffer) +{ + if (buffer.ptr) + { + cudaError_t free_err = cudaFree(buffer.ptr); + if (free_err != cudaSuccess) + report_cuda_error("cudaFree", free_err); + buffer.ptr = nullptr; + } + buffer.capacity = 0; +} + +inline void release_buffer(CachedIntBuffer &buffer) +{ + if (buffer.ptr) + { + cudaError_t free_err = cudaFree(buffer.ptr); + if (free_err != cudaSuccess) + report_cuda_error("cudaFree", free_err); + buffer.ptr = nullptr; + } + buffer.capacity = 0; +} + inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes) { if (bytes <= buffer.capacity && buffer.ptr) @@ -98,6 +122,95 @@ inline bool ensure_capacity(CachedIntBuffer &buffer, size_t bytes) return true; } +struct Rk4VarCache +{ + CachedBuffer X, Y, Z; + CachedBuffer state0, boundary, stage, rhs; + const double *host_X = nullptr; + const double *host_Y = nullptr; + const double *host_Z = nullptr; + const double *host_state0 = nullptr; + double *host_rhs = nullptr; + int nx = 0; + int ny = 0; + int nz = 0; + bool rhs_resident = false; +}; + +struct InterpStencilCacheEntry +{ + const double *X = nullptr; + const double *Y = nullptr; + const double *Z = nullptr; + const double *px = nullptr; + const double *py = nullptr; + const double *pz = nullptr; + int nx = 0; + int ny = 0; + int nz = 0; + int num_points = 0; + int ordn = 0; + int symmetry = 0; + bool valid = false; + CachedBuffer weights; + CachedIntBuffer indices; + CachedIntBuffer reflect; +}; + +struct InterpBatchCache +{ + CachedBuffer out; + CachedBuffer soa; + CachedBuffer field_ptrs; + CachedIntBuffer error_flag; + std::vector host_field_copies; + InterpStencilCacheEntry stencil_entry; +}; + +std::unordered_map &rk4_var_cache_map() +{ + static thread_local std::unordered_map cache_map; + return cache_map; +} + +InterpBatchCache &interp_batch_cache() +{ + static thread_local InterpBatchCache cache; + return cache; +} + +inline void release_interp_stencil_cache(InterpStencilCacheEntry &entry) +{ + release_buffer(entry.weights); + release_buffer(entry.indices); + release_buffer(entry.reflect); + entry.X = nullptr; + entry.Y = nullptr; + entry.Z = nullptr; + entry.px = nullptr; + entry.py = nullptr; + entry.pz = nullptr; + entry.nx = 0; + entry.ny = 0; + entry.nz = 0; + entry.num_points = 0; + entry.ordn = 0; + entry.symmetry = 0; + entry.valid = false; +} + +inline void release_interp_batch_cache(InterpBatchCache &cache) +{ + release_buffer(cache.out); + release_buffer(cache.soa); + release_buffer(cache.field_ptrs); + release_buffer(cache.error_flag); + for (size_t i = 0; i < cache.host_field_copies.size(); ++i) + release_buffer(cache.host_field_copies[i]); + cache.host_field_copies.clear(); + release_interp_stencil_cache(cache.stencil_entry); +} + inline bool copy_to_device(CachedIntBuffer &dst, const int *src, size_t bytes) { if (!ensure_capacity(dst, bytes)) @@ -731,22 +844,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, int rk_stage, bool download_to_host) { - struct Rk4VarCache - { - CachedBuffer X, Y, Z; - CachedBuffer state0, boundary, stage, rhs; - const double *host_X = nullptr; - const double *host_Y = nullptr; - const double *host_Z = nullptr; - const double *host_state0 = nullptr; - double *host_rhs = nullptr; - int nx = 0; - int ny = 0; - int nz = 0; - bool rhs_resident = false; - }; - static thread_local std::unordered_map cache_map; - Rk4VarCache &cache = cache_map[state0]; + Rk4VarCache &cache = rk4_var_cache_map()[state0]; int nx = ex[0]; int ny = ex[1]; @@ -909,6 +1007,29 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, return ok ? 0 : 1; } +void bssn_cuda_release_rk4_caches() +{ + std::unordered_map &cache_map = rk4_var_cache_map(); + for (std::unordered_map::iterator it = cache_map.begin(); + it != cache_map.end(); ++it) + { + Rk4VarCache &cache = it->second; + release_buffer(cache.X); + release_buffer(cache.Y); + release_buffer(cache.Z); + release_buffer(cache.state0); + release_buffer(cache.boundary); + release_buffer(cache.stage); + release_buffer(cache.rhs); + } + cache_map.clear(); +} + +void bssn_cuda_release_interp_caches() +{ + release_interp_batch_cache(interp_batch_cache()); +} + int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host) { static thread_local CachedBuffer d_chi; @@ -988,40 +1109,7 @@ int bssn_cuda_interp_points_batch(const int *ex, if (ex[0] < ordn || ex[1] < ordn || ex[2] < ordn) return 1; - struct InterpBatchCache - { - struct StencilCacheEntry - { - const double *X; - const double *Y; - const double *Z; - const double *px; - const double *py; - const double *pz; - int nx; - int ny; - int nz; - int num_points; - int ordn; - int symmetry; - CachedBuffer weights; - CachedIntBuffer indices; - CachedIntBuffer reflect; - - StencilCacheEntry() - : X(nullptr), Y(nullptr), Z(nullptr), - px(nullptr), py(nullptr), pz(nullptr), - nx(0), ny(0), nz(0), num_points(0), ordn(0), symmetry(0) {} - }; - - CachedBuffer out; - CachedBuffer soa; - CachedBuffer field_ptrs; - CachedIntBuffer error_flag; - std::vector host_field_copies; - std::vector stencil_entries; - }; - static thread_local InterpBatchCache cache; + InterpBatchCache &cache = interp_batch_cache(); const int nx = ex[0]; const int ny = ex[1]; @@ -1037,37 +1125,31 @@ int bssn_cuda_interp_points_batch(const int *ex, const size_t indices_bytes = point_stencil_ints * sizeof(int); bool ok = true; - InterpBatchCache::StencilCacheEntry *stencil_cache = nullptr; - for (size_t i = 0; i < cache.stencil_entries.size(); ++i) - { - InterpBatchCache::StencilCacheEntry &entry = cache.stencil_entries[i]; - if (entry.X == X && entry.Y == Y && entry.Z == Z && - entry.px == px && entry.py == py && entry.pz == pz && - entry.nx == nx && entry.ny == ny && entry.nz == nz && - entry.num_points == num_points && entry.ordn == ordn && - entry.symmetry == symmetry) - { - stencil_cache = &entry; - break; - } - } + InterpStencilCacheEntry &stencil_cache = cache.stencil_entry; + const bool stencil_match = + stencil_cache.valid && + stencil_cache.X == X && stencil_cache.Y == Y && stencil_cache.Z == Z && + stencil_cache.px == px && stencil_cache.py == py && stencil_cache.pz == pz && + stencil_cache.nx == nx && stencil_cache.ny == ny && stencil_cache.nz == nz && + stencil_cache.num_points == num_points && stencil_cache.ordn == ordn && + stencil_cache.symmetry == symmetry; - if (!stencil_cache) + if (!stencil_match) { - cache.stencil_entries.push_back(InterpBatchCache::StencilCacheEntry()); - stencil_cache = &cache.stencil_entries.back(); - stencil_cache->X = X; - stencil_cache->Y = Y; - stencil_cache->Z = Z; - stencil_cache->px = px; - stencil_cache->py = py; - stencil_cache->pz = pz; - stencil_cache->nx = nx; - stencil_cache->ny = ny; - stencil_cache->nz = nz; - stencil_cache->num_points = num_points; - stencil_cache->ordn = ordn; - stencil_cache->symmetry = symmetry; + release_interp_stencil_cache(stencil_cache); + stencil_cache.X = X; + stencil_cache.Y = Y; + stencil_cache.Z = Z; + stencil_cache.px = px; + stencil_cache.py = py; + stencil_cache.pz = pz; + stencil_cache.nx = nx; + stencil_cache.ny = ny; + stencil_cache.nz = nz; + stencil_cache.num_points = num_points; + stencil_cache.ordn = ordn; + stencil_cache.symmetry = symmetry; + stencil_cache.valid = true; std::vector host_weights(point_stencil_doubles); std::vector host_indices(point_stencil_ints); @@ -1104,9 +1186,9 @@ int bssn_cuda_interp_points_batch(const int *ex, } ok = ok && - copy_to_device(stencil_cache->weights, host_weights.data(), weights_bytes) && - copy_to_device(stencil_cache->indices, host_indices.data(), indices_bytes) && - copy_to_device(stencil_cache->reflect, host_reflect.data(), indices_bytes); + copy_to_device(stencil_cache.weights, host_weights.data(), weights_bytes) && + copy_to_device(stencil_cache.indices, host_indices.data(), indices_bytes) && + copy_to_device(stencil_cache.reflect, host_reflect.data(), indices_bytes); if (!ok) return 1; } @@ -1159,9 +1241,9 @@ int bssn_cuda_interp_points_batch(const int *ex, int ny_local = ny; const double *dsoa = cache.soa.ptr; const double *const *dfields = reinterpret_cast(cache.field_ptrs.ptr); - const double *dweights = stencil_cache->weights.ptr; - const int *dindices = stencil_cache->indices.ptr; - const int *dreflect = stencil_cache->reflect.ptr; + const double *dweights = stencil_cache.weights.ptr; + const int *dindices = stencil_cache.indices.ptr; + const int *dreflect = stencil_cache.reflect.ptr; double *dout = cache.out.ptr; int *derror = cache.error_flag.ptr; diff --git a/AMSS_NCKU_source/bssn_cuda_ops.h b/AMSS_NCKU_source/bssn_cuda_ops.h index 27237d9..7b6456a 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.h +++ b/AMSS_NCKU_source/bssn_cuda_ops.h @@ -24,6 +24,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true); int bssn_cuda_download_buffer(int *ex, double *host_ptr); +void bssn_cuda_release_rk4_caches(); +void bssn_cuda_release_interp_caches(); int bssn_cuda_prolong3_pack(int wei, const double *llbc, const double *uubc, const int *extc, const double *func, diff --git a/AMSS_NCKU_source/cgh.C b/AMSS_NCKU_source/cgh.C index 3f46095..8d34e72 100644 --- a/AMSS_NCKU_source/cgh.C +++ b/AMSS_NCKU_source/cgh.C @@ -29,6 +29,7 @@ using namespace std; #include "parameters.h" #ifdef USE_GPU #include "bssn_gpu.h" +#include "bssn_cuda_ops.h" #endif //================================================================================================ @@ -891,6 +892,9 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag, #ifdef USE_GPU bssn_gpu_clear_cached_device_buffers(); bssn_gpu_release_pinned_host_buffers(); + bssn_cuda_release_rk4_caches(); + bssn_cuda_release_interp_caches(); + patch_release_interp_plan_cache(); #endif Parallel::KillBlocks(PatL[lev]); PatL[lev]->destroyList(); @@ -924,6 +928,9 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag, #ifdef USE_GPU bssn_gpu_clear_cached_device_buffers(); bssn_gpu_release_pinned_host_buffers(); + bssn_cuda_release_rk4_caches(); + bssn_cuda_release_interp_caches(); + patch_release_interp_plan_cache(); #endif Parallel::KillBlocks(PatL[lev]); PatL[lev]->destroyList(); @@ -1536,6 +1543,9 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev, #ifdef USE_GPU bssn_gpu_clear_cached_device_buffers(); bssn_gpu_release_pinned_host_buffers(); + bssn_cuda_release_rk4_caches(); + bssn_cuda_release_interp_caches(); + patch_release_interp_plan_cache(); #endif Parallel::KillBlocks(PatL[lev]); PatL[lev]->destroyList(); @@ -1563,6 +1573,9 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev, #ifdef USE_GPU bssn_gpu_clear_cached_device_buffers(); bssn_gpu_release_pinned_host_buffers(); + bssn_cuda_release_rk4_caches(); + bssn_cuda_release_interp_caches(); + patch_release_interp_plan_cache(); #endif Parallel::KillBlocks(PatL[lev]); PatL[lev]->destroyList();