From cf3c6d62181ee310e657ba51ba6504c60427430e Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Thu, 9 Apr 2026 20:48:06 +0800
Subject: [PATCH] Stabilize GPU buffer lifecycle around regrid

---
 AMSS_NCKU_source/Block.C          | 26 ++++++----
 AMSS_NCKU_source/bssn_class.C     | 82 ++++++++++++++++++++---------
 AMSS_NCKU_source/bssn_class.h     | 10 ++--
 AMSS_NCKU_source/bssn_cuda_ops.cu |  7 ++-
 AMSS_NCKU_source/bssn_gpu.cu      | 34 +++++++++++--
 AMSS_NCKU_source/bssn_gpu.h       |  1 +
 AMSS_NCKU_source/bssn_rhs_c.C     | 22 ++++++--
 AMSS_NCKU_source/cgh.C            | 85 +++++++++++++++++++------------
 8 files changed, 186 insertions(+), 81 deletions(-)
diff --git a/AMSS_NCKU_source/Block.C b/AMSS_NCKU_source/Block.C
index fcae198..b875b27 100644
--- a/AMSS_NCKU_source/Block.C
+++ b/AMSS_NCKU_source/Block.C
@@ -9,8 +9,11 @@
 #include <new>
 using namespace std;
 
-#include "Block.h"
-#include "misc.h"
+#include "Block.h"
+#include "misc.h"
+#ifdef USE_GPU
+#include "bssn_gpu.h"
+#endif
 
 Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
 {
@@ -95,14 +98,17 @@ Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fng
   }
 #endif
 }
-Block::~Block()
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  if (myrank == rank)
-  {
-    for (int i = 0; i < dim; i++)
-      delete[] X[i];
+Block::~Block()
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (myrank == rank)
+  {
+#ifdef USE_GPU
+    bssn_gpu_clear_cached_device_buffers();
+#endif
+    for (int i = 0; i < dim; i++)
+      delete[] X[i];
     for (int i = 0; i < ingfs; i++)
       free(igfs[i]);
     delete[] igfs;
diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index b571eaa..a893fb9 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -745,11 +745,12 @@ void bssn_class::Initialize()
   // Initialize sync caches (per-level, for predictor and corrector)
   sync_cache_pre = new Parallel::SyncCache[GH->levels];
   sync_cache_cor = new Parallel::SyncCache[GH->levels];
-  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
-  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
-  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
-  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
-}
+  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
+  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
+  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
+  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
+  sync_cache_psi4 = new Parallel::SyncCache[GH->levels];
+}
 
 //================================================================================================
 
@@ -761,8 +762,8 @@ void bssn_class::Initialize()
 
 //================================================================================================
 
-bssn_class::~bssn_class()
-{
+bssn_class::~bssn_class()
+{
 #ifdef With_AHF
   AHList->clearList();
   AHDList->clearList();
@@ -1019,12 +1020,30 @@ bssn_class::~bssn_class()
       sync_cache_rp_coarse[i].destroy();
     delete[] sync_cache_rp_coarse;
   }
-  if (sync_cache_rp_fine)
-  {
-    for (int i = 0; i < GH->levels; i++)
-      sync_cache_rp_fine[i].destroy();
-    delete[] sync_cache_rp_fine;
-  }
+  if (sync_cache_rp_fine)
+  {
+    for (int i = 0; i < GH->levels; i++)
+      sync_cache_rp_fine[i].destroy();
+    delete[] sync_cache_rp_fine;
+  }
+  if (sync_cache_restrict)
+  {
+    for (int i = 0; i < GH->levels; i++)
+      sync_cache_restrict[i].destroy();
+    delete[] sync_cache_restrict;
+  }
+  if (sync_cache_outbd)
+  {
+    for (int i = 0; i < GH->levels; i++)
+      sync_cache_outbd[i].destroy();
+    delete[] sync_cache_outbd;
+  }
+  if (sync_cache_psi4)
+  {
+    for (int i = 0; i < GH->levels; i++)
+      sync_cache_psi4[i].destroy();
+    delete[] sync_cache_psi4;
+  }
 
   delete GH;
 #ifdef WithShell
@@ -1057,8 +1076,25 @@ bssn_class::~bssn_class()
   delete ConVMonitor;
   delete Waveshell;
 
-  delete CheckPoint;
-}
+  delete CheckPoint;
+}
+
+void bssn_class::InvalidateSyncCaches() // Invalidate every per-level sync cache held by this object.
+{
+  if (!GH) // without a grid hierarchy there is no level count to iterate over
+    return;
+
+  for (int il = 0; il < GH->levels; il++) // all levels are invalidated, not only a level that was regridded
+  {
+    sync_cache_pre[il].invalidate();       // predictor sync
+    sync_cache_cor[il].invalidate();       // corrector sync
+    sync_cache_rp_coarse[il].invalidate(); // RestrictProlong sync on PatL[lev-1]
+    sync_cache_rp_fine[il].invalidate();   // RestrictProlong sync on PatL[lev]
+    sync_cache_restrict[il].invalidate();  // cached Restrict in RestrictProlong
+    sync_cache_outbd[il].invalidate();     // cached OutBdLow2Hi in RestrictProlong
+    sync_cache_psi4[il].invalidate();      // Psi4 sync used by Compute_Psi4
+  }
+}
 
 //================================================================================================
 
@@ -2229,7 +2265,7 @@ void bssn_class::Evolve(int Steps)
     GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
                SynchList_cor, OldStateList, StateList, SynchList_pre,
                fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
-    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+    InvalidateSyncCaches();
 #endif
 
 #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2450,7 +2486,7 @@ void bssn_class::RecursiveStep(int lev)
   if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                       SynchList_cor, OldStateList, StateList, SynchList_pre,
                       fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+  InvalidateSyncCaches();
 #endif
 }
 
@@ -2629,7 +2665,7 @@ void bssn_class::ParallelStep()
   if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
                       SynchList_cor, OldStateList, StateList, SynchList_pre,
                       fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+  InvalidateSyncCaches();
 #endif
 }
 
@@ -2796,7 +2832,7 @@ void bssn_class::ParallelStep()
         if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
                             SynchList_cor, OldStateList, StateList, SynchList_pre,
                             fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
-        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+        InvalidateSyncCaches();
 
         //               a_stream.clear();
         //               a_stream.str("");
@@ -2811,7 +2847,7 @@ void bssn_class::ParallelStep()
       if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                           SynchList_cor, OldStateList, StateList, SynchList_pre,
                           fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+      InvalidateSyncCaches();
 
       //               a_stream.clear();
       //               a_stream.str("");
@@ -2830,7 +2866,7 @@ void bssn_class::ParallelStep()
           if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                               SynchList_cor, OldStateList, StateList, SynchList_pre,
                               fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+          InvalidateSyncCaches();
 
           //               a_stream.clear();
           //               a_stream.str("");
@@ -2846,7 +2882,7 @@ void bssn_class::ParallelStep()
           if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                               SynchList_cor, OldStateList, StateList, SynchList_pre,
                               fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+          InvalidateSyncCaches();
 
           //               a_stream.clear();
           //               a_stream.str("");
@@ -6262,7 +6298,7 @@ for(int ilev = GH->levels-1;ilev>=lev;ilev--)
 for(int ilev=GH->levels-1;ilev>lev;ilev--)
     RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List);
 #else
-  Parallel::Sync(GH->PatL[lev], DG_List, Symmetry, "bssn_class::Compute_Psi4");
+  Parallel::Sync_cached(GH->PatL[lev], DG_List, Symmetry, sync_cache_psi4[lev]);
 #endif
 
 #ifdef WithShell
diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h
index 94fd306..c004c06 100644
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -128,10 +128,11 @@ public:
 
        Parallel::SyncCache *sync_cache_pre;  // per-level cache for predictor sync
        Parallel::SyncCache *sync_cache_cor;  // per-level cache for corrector sync
-       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
-       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]
-       Parallel::SyncCache *sync_cache_restrict;   // cached Restrict in RestrictProlong
-       Parallel::SyncCache *sync_cache_outbd;      // cached OutBdLow2Hi in RestrictProlong
+       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
+       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]
+       Parallel::SyncCache *sync_cache_restrict;   // cached Restrict in RestrictProlong
+       Parallel::SyncCache *sync_cache_outbd;      // cached OutBdLow2Hi in RestrictProlong
+       Parallel::SyncCache *sync_cache_psi4;       // cached Psi4 sync on PatL[lev]
 
        monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
        monitor *ConVMonitor;
@@ -176,6 +177,7 @@ public:
        virtual void Initialize();
        virtual void Read_Ansorg();
        virtual void Read_Pablo() {};
+       void InvalidateSyncCaches();
        virtual void Compute_Psi4(int lev);
        virtual void Step(int lev, int YN);
 #ifdef USE_GPU
diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu
index f619d15..45fb588 100644
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -1201,9 +1201,9 @@ int bssn_cuda_prolong3_pack(int wei,
   if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
     return 1;
 
-  // The current input runs with equatorial symmetry enabled.
-  // The symmetry-aware prolong CUDA path is not numerically stable yet,
-  // so force a safe fallback to the original Fortran implementation.
+  // The symmetry-aware prolong CUDA path is still not equivalent to the
+  // active Cell/ghost_width=3 Fortran implementation, so keep the safe
+  // fallback for all symmetry-enabled cases.
   if (symmetry != 0)
     return 1;
 
@@ -1276,7 +1276,6 @@ int bssn_cuda_prolong3_pack(int wei,
 
   // Current CUDA prolong path only supports the same fast path as the
   // optimized Fortran code: interior stencil access without symmetry_bd().
-  // If the stencil touches the symmetry boundary, fall back to Fortran.
   if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1)
     return 1;
 
diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu
index 78c0429..22bc8dc 100644
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -135,7 +135,7 @@ struct GpuRhsCache
 	const double *last_y = nullptr;
 	const double *last_z = nullptr;
 	bool meta_uploaded = false;
-	static const int max_mapped_buffers = 128;
+	static const int max_mapped_buffers = 512;
 	const double *host_buffers[max_mapped_buffers] = {nullptr};
 	const double *device_buffers[max_mapped_buffers] = {nullptr};
 	int mapped_buffer_count = 0;
@@ -143,7 +143,7 @@ struct GpuRhsCache
 
 struct ExternalBufferRegistry
 {
-	static const int max_mapped_buffers = 256;
+	static const int max_mapped_buffers = 4096;
 	const double *host_buffers[max_mapped_buffers] = {nullptr};
 	const double *device_buffers[max_mapped_buffers] = {nullptr};
 	int mapped_buffer_count = 0;
@@ -151,7 +151,7 @@ struct ExternalBufferRegistry
 
 struct OwnedBufferRegistry
 {
-	static const int max_mapped_buffers = 256;
+	static const int max_mapped_buffers = 4096;
 	const double *host_buffers[max_mapped_buffers] = {nullptr};
 	double *device_buffers[max_mapped_buffers] = {nullptr};
 	size_t capacities[max_mapped_buffers] = {0};
@@ -223,7 +223,11 @@ void map_buffer(GpuRhsCache &cache, const double *host_ptr, const double *device
 	}
 
 	if (cache.mapped_buffer_count >= GpuRhsCache::max_mapped_buffers)
+	{
+		cerr << "gpu RHS buffer registry exhausted at " << GpuRhsCache::max_mapped_buffers
+		     << " entries" << endl;
 		return;
+	}
 
 	cache.host_buffers[cache.mapped_buffer_count] = host_ptr;
 	cache.device_buffers[cache.mapped_buffer_count] = device_ptr;
@@ -255,7 +259,11 @@ void map_external_buffer(ExternalBufferRegistry &registry, const double *host_pt
 	}
 
 	if (registry.mapped_buffer_count >= ExternalBufferRegistry::max_mapped_buffers)
+	{
+		cerr << "external CUDA buffer registry exhausted at "
+		     << ExternalBufferRegistry::max_mapped_buffers << " entries" << endl;
 		return;
+	}
 
 	registry.host_buffers[registry.mapped_buffer_count] = host_ptr;
 	registry.device_buffers[registry.mapped_buffer_count] = device_ptr;
@@ -421,6 +429,7 @@ void ensure_host_buffer_registered(const double *host_ptr, size_t bytes)
 		return;
 	}
 
+	cerr << "cudaHostRegister failed: " << cudaGetErrorString(err) << endl;
 	registry.failed[slot] = true;
 	registry.capacities[slot] = bytes;
 }
@@ -932,6 +941,25 @@ void bssn_gpu_clear_cached_device_buffers()
 	invalidate_owned_buffer_map(owned_buffer_registry());
 }
 
+void bssn_gpu_release_pinned_host_buffers() // Unregister every tracked pinned host buffer and reset the registry to empty.
+{
+	PinnedHostRegistry &pinned = pinned_host_registry();
+	for (int i = 0; i < pinned.buffer_count; ++i)
+	{
+		if (pinned.registered[i] && pinned.host_buffers[i]) // only slots flagged registered with a non-null pointer are unregistered
+		{
+			cudaError_t unreg_err = cudaHostUnregister(const_cast<double *>(pinned.host_buffers[i])); // const_cast: the CUDA call takes a non-const pointer
+			if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered) // "not registered" is tolerated silently
+				cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl;
+		}
+		pinned.host_buffers[i] = nullptr; // the slot is cleared even when the unregister call reported an error
+		pinned.capacities[i] = 0;
+		pinned.registered[i] = false;
+		pinned.failed[i] = false; // also drops the recorded failure flag for this slot
+	}
+	pinned.buffer_count = 0; // registry is empty after this call
+}
+
 void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr)
 {
 	map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
diff --git a/AMSS_NCKU_source/bssn_gpu.h b/AMSS_NCKU_source/bssn_gpu.h
index bb1e50e..5a3337d 100644
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -67,6 +67,7 @@ int gpu_rhs_ss(RHS_SS_PARA);
 
 int bssn_gpu_bind_process_device(int mpi_rank);
 void bssn_gpu_clear_cached_device_buffers();
+void bssn_gpu_release_pinned_host_buffers();
 const double *bssn_gpu_find_device_buffer(const double *host_ptr);
 void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
 void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count);
diff --git a/AMSS_NCKU_source/bssn_rhs_c.C b/AMSS_NCKU_source/bssn_rhs_c.C
index 4354866..be569b0 100644
--- a/AMSS_NCKU_source/bssn_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_rhs_c.C
@@ -1022,9 +1022,16 @@ int f_compute_rhs_bssn(int *ex, double &T,
                         + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
 
             #if (GAUGE == 2)
-            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
+            {
+                const double chi_sqrt = sqrt(chin1[i]);
+                const double damping = ONE - chi_sqrt;
+                reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
+            }
             #else
-            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
+            {
+                const double damping = ONE - chin1[i];
+                reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
+            }
             #endif
 
             dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i];
@@ -1040,9 +1047,16 @@ int f_compute_rhs_bssn(int *ex, double &T,
                         + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
 
             #if (GAUGE == 4)
-            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
+            {
+                const double chi_sqrt = sqrt(chin1[i]);
+                const double damping = ONE - chi_sqrt;
+                reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
+            }
             #else
-            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
+            {
+                const double damping = ONE - chin1[i];
+                reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
+            }
             #endif
 
             betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i];
diff --git a/AMSS_NCKU_source/cgh.C b/AMSS_NCKU_source/cgh.C
index 6e60f68..3f46095 100644
--- a/AMSS_NCKU_source/cgh.C
+++ b/AMSS_NCKU_source/cgh.C
@@ -23,10 +23,13 @@ using namespace std;
 #include <mpi.h>
 
 #include "macrodef.h"
-#include "misc.h"
-#include "cgh.h"
-#include "Parallel.h"
-#include "parameters.h"
+#include "misc.h"
+#include "cgh.h"
+#include "Parallel.h"
+#include "parameters.h"
+#ifdef USE_GPU
+#include "bssn_gpu.h"
+#endif
 
 //================================================================================================
 
@@ -881,13 +884,17 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
       tmPat = construct_patchlist(lev, Symmetry);
       // tmPat construction completes
       Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
-      //    checkPatchList(tmPat,true);
-      bool CC = (lev > trfls);
-      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-
-      Parallel::KillBlocks(PatL[lev]);
-      PatL[lev]->destroyList();
-      PatL[lev] = tmPat;
+      //    checkPatchList(tmPat,true);
+      bool CC = (lev > trfls);
+      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+
+#ifdef USE_GPU
+      bssn_gpu_clear_cached_device_buffers();
+      bssn_gpu_release_pinned_host_buffers();
+#endif
+      Parallel::KillBlocks(PatL[lev]);
+      PatL[lev]->destroyList();
+      PatL[lev] = tmPat;
 #if (RPB == 1)
       Parallel::destroypsuList_bam(bdsul[lev]);
       Parallel::destroypsuList_bam(rsul[lev]);
@@ -910,13 +917,17 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
       tmPat = construct_patchlist(lev, Symmetry);
       // tmPat construction completes
       Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
-      //    checkPatchList(tmPat,true);
-      bool CC = (lev > trfls);
-      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-
-      Parallel::KillBlocks(PatL[lev]);
-      PatL[lev]->destroyList();
-      PatL[lev] = tmPat;
+      //    checkPatchList(tmPat,true);
+      bool CC = (lev > trfls);
+      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+
+#ifdef USE_GPU
+      bssn_gpu_clear_cached_device_buffers();
+      bssn_gpu_release_pinned_host_buffers();
+#endif
+      Parallel::KillBlocks(PatL[lev]);
+      PatL[lev]->destroyList();
+      PatL[lev] = tmPat;
 #if (RPB == 1)
 #error "not support yet"
 #endif
@@ -1518,13 +1529,17 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
   tmPat = construct_patchlist(lev, Symmetry);
   // tmPat construction completes
   Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
-  //    checkPatchList(tmPat,true);
-  bool CC = (lev > trfls);
-  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-
-  Parallel::KillBlocks(PatL[lev]);
-  PatL[lev]->destroyList();
-  PatL[lev] = tmPat;
+  //    checkPatchList(tmPat,true);
+  bool CC = (lev > trfls);
+  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+
+#ifdef USE_GPU
+  bssn_gpu_clear_cached_device_buffers();
+  bssn_gpu_release_pinned_host_buffers();
+#endif
+  Parallel::KillBlocks(PatL[lev]);
+  PatL[lev]->destroyList();
+  PatL[lev] = tmPat;
 }
 #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
 #warning "recompose_cgh_Onelevel is not implimented yet"
@@ -1540,14 +1555,18 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
   // tmPat construction completes
   Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
   misc::tillherecheck(Commlev[lev], start_rank[lev], "after distribute");
-  //    checkPatchList(tmPat,true);
-  bool CC = (lev > trfls);
-  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-  misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
-
-  Parallel::KillBlocks(PatL[lev]);
-  PatL[lev]->destroyList();
-  PatL[lev] = tmPat;
+  //    checkPatchList(tmPat,true);
+  bool CC = (lev > trfls);
+  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+  misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
+
+#ifdef USE_GPU
+  bssn_gpu_clear_cached_device_buffers();
+  bssn_gpu_release_pinned_host_buffers();
+#endif
+  Parallel::KillBlocks(PatL[lev]);
+  PatL[lev]->destroyList();
+  PatL[lev] = tmPat;
 }