From cf3c6d62181ee310e657ba51ba6504c60427430e Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 9 Apr 2026 20:48:06 +0800 Subject: [PATCH] Stabilize GPU buffer lifecycle around regrid --- AMSS_NCKU_source/Block.C | 26 ++++++---- AMSS_NCKU_source/bssn_class.C | 82 ++++++++++++++++++++--------- AMSS_NCKU_source/bssn_class.h | 10 ++-- AMSS_NCKU_source/bssn_cuda_ops.cu | 7 ++- AMSS_NCKU_source/bssn_gpu.cu | 34 +++++++++++-- AMSS_NCKU_source/bssn_gpu.h | 1 + AMSS_NCKU_source/bssn_rhs_c.C | 22 ++++++-- AMSS_NCKU_source/cgh.C | 85 +++++++++++++++++++------------ 8 files changed, 186 insertions(+), 81 deletions(-) diff --git a/AMSS_NCKU_source/Block.C b/AMSS_NCKU_source/Block.C index fcae198..b875b27 100644 --- a/AMSS_NCKU_source/Block.C +++ b/AMSS_NCKU_source/Block.C @@ -9,8 +9,11 @@ #include using namespace std; -#include "Block.h" -#include "misc.h" +#include "Block.h" +#include "misc.h" +#ifdef USE_GPU +#include "bssn_gpu.h" +#endif Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui) { @@ -95,14 +98,17 @@ Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fng } #endif } -Block::~Block() -{ - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - if (myrank == rank) - { - for (int i = 0; i < dim; i++) - delete[] X[i]; +Block::~Block() +{ + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (myrank == rank) + { +#ifdef USE_GPU + bssn_gpu_clear_cached_device_buffers(); +#endif + for (int i = 0; i < dim; i++) + delete[] X[i]; for (int i = 0; i < ingfs; i++) free(igfs[i]); delete[] igfs; diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index b571eaa..a893fb9 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -745,11 +745,12 @@ void bssn_class::Initialize() // Initialize sync caches (per-level, for predictor and corrector) sync_cache_pre = 
new Parallel::SyncCache[GH->levels]; sync_cache_cor = new Parallel::SyncCache[GH->levels]; - sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; - sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; - sync_cache_restrict = new Parallel::SyncCache[GH->levels]; - sync_cache_outbd = new Parallel::SyncCache[GH->levels]; -} + sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; + sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; + sync_cache_restrict = new Parallel::SyncCache[GH->levels]; + sync_cache_outbd = new Parallel::SyncCache[GH->levels]; + sync_cache_psi4 = new Parallel::SyncCache[GH->levels]; +} //================================================================================================ @@ -761,8 +762,8 @@ void bssn_class::Initialize() //================================================================================================ -bssn_class::~bssn_class() -{ +bssn_class::~bssn_class() +{ #ifdef With_AHF AHList->clearList(); AHDList->clearList(); @@ -1019,12 +1020,30 @@ bssn_class::~bssn_class() sync_cache_rp_coarse[i].destroy(); delete[] sync_cache_rp_coarse; } - if (sync_cache_rp_fine) - { - for (int i = 0; i < GH->levels; i++) - sync_cache_rp_fine[i].destroy(); - delete[] sync_cache_rp_fine; - } + if (sync_cache_rp_fine) + { + for (int i = 0; i < GH->levels; i++) + sync_cache_rp_fine[i].destroy(); + delete[] sync_cache_rp_fine; + } + if (sync_cache_restrict) + { + for (int i = 0; i < GH->levels; i++) + sync_cache_restrict[i].destroy(); + delete[] sync_cache_restrict; + } + if (sync_cache_outbd) + { + for (int i = 0; i < GH->levels; i++) + sync_cache_outbd[i].destroy(); + delete[] sync_cache_outbd; + } + if (sync_cache_psi4) + { + for (int i = 0; i < GH->levels; i++) + sync_cache_psi4[i].destroy(); + delete[] sync_cache_psi4; + } delete GH; #ifdef WithShell @@ -1057,8 +1076,25 @@ bssn_class::~bssn_class() delete ConVMonitor; delete Waveshell; - delete CheckPoint; -} + delete CheckPoint; +} + +void 
bssn_class::InvalidateSyncCaches() +{ + if (!GH) + return; + + for (int il = 0; il < GH->levels; il++) + { + sync_cache_pre[il].invalidate(); + sync_cache_cor[il].invalidate(); + sync_cache_rp_coarse[il].invalidate(); + sync_cache_rp_fine[il].invalidate(); + sync_cache_restrict[il].invalidate(); + sync_cache_outbd[il].invalidate(); + sync_cache_psi4[il].invalidate(); + } +} //================================================================================================ @@ -2229,7 +2265,7 @@ void bssn_class::Evolve(int Steps) GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } + InvalidateSyncCaches(); #endif #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2)) @@ -2450,7 +2486,7 @@ void bssn_class::RecursiveStep(int lev) if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } + InvalidateSyncCaches(); #endif } @@ -2629,7 +2665,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); 
sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } + InvalidateSyncCaches(); #endif } @@ -2796,7 +2832,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } + InvalidateSyncCaches(); // a_stream.clear(); // a_stream.str(""); @@ -2811,7 +2847,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } + InvalidateSyncCaches(); // a_stream.clear(); // a_stream.str(""); @@ -2830,7 +2866,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } + InvalidateSyncCaches(); // a_stream.clear(); // a_stream.str(""); @@ -2846,7 +2882,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - 
dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } + InvalidateSyncCaches(); // a_stream.clear(); // a_stream.str(""); @@ -6262,7 +6298,7 @@ for(int ilev = GH->levels-1;ilev>=lev;ilev--) for(int ilev=GH->levels-1;ilev>lev;ilev--) RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List); #else - Parallel::Sync(GH->PatL[lev], DG_List, Symmetry, "bssn_class::Compute_Psi4"); + Parallel::Sync_cached(GH->PatL[lev], DG_List, Symmetry, sync_cache_psi4[lev]); #endif #ifdef WithShell diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h index 94fd306..c004c06 100644 --- a/AMSS_NCKU_source/bssn_class.h +++ b/AMSS_NCKU_source/bssn_class.h @@ -128,10 +128,11 @@ public: Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync - Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1] - Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev] - Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong - Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong + Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1] + Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev] + Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong + Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong + Parallel::SyncCache *sync_cache_psi4; // cached Psi4 sync on PatL[lev] monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ConVMonitor; @@ -176,6 +177,7 @@ public: virtual void Initialize(); virtual void Read_Ansorg(); virtual 
void Read_Pablo() {}; + void InvalidateSyncCaches(); virtual void Compute_Psi4(int lev); virtual void Step(int lev, int YN); #ifdef USE_GPU diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index f619d15..45fb588 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -1201,9 +1201,9 @@ int bssn_cuda_prolong3_pack(int wei, if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA) return 1; - // The current input runs with equatorial symmetry enabled. - // The symmetry-aware prolong CUDA path is not numerically stable yet, - // so force a safe fallback to the original Fortran implementation. + // The symmetry-aware prolong CUDA path is still not equivalent to the + // active Cell/ghost_width=3 Fortran implementation, so keep the safe + // fallback for all symmetry-enabled cases. if (symmetry != 0) return 1; @@ -1276,7 +1276,6 @@ int bssn_cuda_prolong3_pack(int wei, // Current CUDA prolong path only supports the same fast path as the // optimized Fortran code: interior stencil access without symmetry_bd(). - // If the stencil touches the symmetry boundary, fall back to Fortran. 
if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1) return 1; diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index 78c0429..22bc8dc 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -135,7 +135,7 @@ struct GpuRhsCache const double *last_y = nullptr; const double *last_z = nullptr; bool meta_uploaded = false; - static const int max_mapped_buffers = 128; + static const int max_mapped_buffers = 512; const double *host_buffers[max_mapped_buffers] = {nullptr}; const double *device_buffers[max_mapped_buffers] = {nullptr}; int mapped_buffer_count = 0; @@ -143,7 +143,7 @@ struct GpuRhsCache struct ExternalBufferRegistry { - static const int max_mapped_buffers = 256; + static const int max_mapped_buffers = 4096; const double *host_buffers[max_mapped_buffers] = {nullptr}; const double *device_buffers[max_mapped_buffers] = {nullptr}; int mapped_buffer_count = 0; @@ -151,7 +151,7 @@ struct ExternalBufferRegistry struct OwnedBufferRegistry { - static const int max_mapped_buffers = 256; + static const int max_mapped_buffers = 4096; const double *host_buffers[max_mapped_buffers] = {nullptr}; double *device_buffers[max_mapped_buffers] = {nullptr}; size_t capacities[max_mapped_buffers] = {0}; @@ -223,7 +223,11 @@ void map_buffer(GpuRhsCache &cache, const double *host_ptr, const double *device } if (cache.mapped_buffer_count >= GpuRhsCache::max_mapped_buffers) + { + cerr << "gpu RHS buffer registry exhausted at " << GpuRhsCache::max_mapped_buffers + << " entries" << endl; return; + } cache.host_buffers[cache.mapped_buffer_count] = host_ptr; cache.device_buffers[cache.mapped_buffer_count] = device_ptr; @@ -255,7 +259,11 @@ void map_external_buffer(ExternalBufferRegistry &registry, const double *host_pt } if (registry.mapped_buffer_count >= ExternalBufferRegistry::max_mapped_buffers) + { + cerr << "external CUDA buffer registry exhausted at " + << ExternalBufferRegistry::max_mapped_buffers << " entries" << endl; return; + } 
registry.host_buffers[registry.mapped_buffer_count] = host_ptr; registry.device_buffers[registry.mapped_buffer_count] = device_ptr; @@ -421,6 +429,7 @@ void ensure_host_buffer_registered(const double *host_ptr, size_t bytes) return; } + cerr << "cudaHostRegister failed: " << cudaGetErrorString(err) << endl; registry.failed[slot] = true; registry.capacities[slot] = bytes; } @@ -932,6 +941,25 @@ void bssn_gpu_clear_cached_device_buffers() invalidate_owned_buffer_map(owned_buffer_registry()); } +void bssn_gpu_release_pinned_host_buffers() +{ + PinnedHostRegistry &pinned = pinned_host_registry(); + for (int i = 0; i < pinned.buffer_count; ++i) + { + if (pinned.registered[i] && pinned.host_buffers[i]) + { + cudaError_t unreg_err = cudaHostUnregister(const_cast(pinned.host_buffers[i])); + if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered) + cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl; + } + pinned.host_buffers[i] = nullptr; + pinned.capacities[i] = 0; + pinned.registered[i] = false; + pinned.failed[i] = false; + } + pinned.buffer_count = 0; +} + void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr) { map_external_buffer(external_buffer_registry(), host_ptr, device_ptr); diff --git a/AMSS_NCKU_source/bssn_gpu.h b/AMSS_NCKU_source/bssn_gpu.h index bb1e50e..5a3337d 100644 --- a/AMSS_NCKU_source/bssn_gpu.h +++ b/AMSS_NCKU_source/bssn_gpu.h @@ -67,6 +67,7 @@ int gpu_rhs_ss(RHS_SS_PARA); int bssn_gpu_bind_process_device(int mpi_rank); void bssn_gpu_clear_cached_device_buffers(); +void bssn_gpu_release_pinned_host_buffers(); const double *bssn_gpu_find_device_buffer(const double *host_ptr); void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr); void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count); diff --git a/AMSS_NCKU_source/bssn_rhs_c.C b/AMSS_NCKU_source/bssn_rhs_c.C index 4354866..be569b0 100644 --- a/AMSS_NCKU_source/bssn_rhs_c.C 
+++ b/AMSS_NCKU_source/bssn_rhs_c.C @@ -1022,9 +1022,16 @@ int f_compute_rhs_bssn(int *ex, double &T, + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] ); #if (GAUGE == 2) - reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 ); + { + const double chi_sqrt = sqrt(chin1[i]); + const double damping = ONE - chi_sqrt; + reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping); + } #else - reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 ); + { + const double damping = ONE - chin1[i]; + reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping); + } #endif dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i]; @@ -1040,9 +1047,16 @@ int f_compute_rhs_bssn(int *ex, double &T, + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] ); #if (GAUGE == 4) - reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 ); + { + const double chi_sqrt = sqrt(chin1[i]); + const double damping = ONE - chi_sqrt; + reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping); + } #else - reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 ); + { + const double damping = ONE - chin1[i]; + reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping); + } #endif betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i]; diff --git a/AMSS_NCKU_source/cgh.C b/AMSS_NCKU_source/cgh.C index 6e60f68..3f46095 100644 --- a/AMSS_NCKU_source/cgh.C +++ b/AMSS_NCKU_source/cgh.C @@ -23,10 +23,13 @@ using namespace std; #include #include "macrodef.h" -#include "misc.h" -#include "cgh.h" -#include "Parallel.h" -#include "parameters.h" +#include "misc.h" +#include "cgh.h" +#include "Parallel.h" +#include "parameters.h" +#ifdef USE_GPU +#include "bssn_gpu.h" +#endif //================================================================================================ @@ -881,13 +884,17 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag, tmPat = construct_patchlist(lev, Symmetry); 
// tmPat construction completes Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false); - // checkPatchList(tmPat,true); - bool CC = (lev > trfls); - Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); - - Parallel::KillBlocks(PatL[lev]); - PatL[lev]->destroyList(); - PatL[lev] = tmPat; + // checkPatchList(tmPat,true); + bool CC = (lev > trfls); + Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); + +#ifdef USE_GPU + bssn_gpu_clear_cached_device_buffers(); + bssn_gpu_release_pinned_host_buffers(); +#endif + Parallel::KillBlocks(PatL[lev]); + PatL[lev]->destroyList(); + PatL[lev] = tmPat; #if (RPB == 1) Parallel::destroypsuList_bam(bdsul[lev]); Parallel::destroypsuList_bam(rsul[lev]); @@ -910,13 +917,17 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag, tmPat = construct_patchlist(lev, Symmetry); // tmPat construction completes Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]); - // checkPatchList(tmPat,true); - bool CC = (lev > trfls); - Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); - - Parallel::KillBlocks(PatL[lev]); - PatL[lev]->destroyList(); - PatL[lev] = tmPat; + // checkPatchList(tmPat,true); + bool CC = (lev > trfls); + Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); + +#ifdef USE_GPU + bssn_gpu_clear_cached_device_buffers(); + bssn_gpu_release_pinned_host_buffers(); +#endif + Parallel::KillBlocks(PatL[lev]); + PatL[lev]->destroyList(); + PatL[lev] = tmPat; #if (RPB == 1) #error "not support yet" #endif @@ -1518,13 +1529,17 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev, tmPat = construct_patchlist(lev, Symmetry); // tmPat construction completes Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false); - 
// checkPatchList(tmPat,true); - bool CC = (lev > trfls); - Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); - - Parallel::KillBlocks(PatL[lev]); - PatL[lev]->destroyList(); - PatL[lev] = tmPat; + // checkPatchList(tmPat,true); + bool CC = (lev > trfls); + Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); + +#ifdef USE_GPU + bssn_gpu_clear_cached_device_buffers(); + bssn_gpu_release_pinned_host_buffers(); +#endif + Parallel::KillBlocks(PatL[lev]); + PatL[lev]->destroyList(); + PatL[lev] = tmPat; } #elif (PSTR == 1 || PSTR == 2 || PSTR == 3) #warning "recompose_cgh_Onelevel is not implimented yet" @@ -1540,14 +1555,18 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev, // tmPat construction completes Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]); misc::tillherecheck(Commlev[lev], start_rank[lev], "after distribute"); - // checkPatchList(tmPat,true); - bool CC = (lev > trfls); - Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); - misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data"); - - Parallel::KillBlocks(PatL[lev]); - PatL[lev]->destroyList(); - PatL[lev] = tmPat; + // checkPatchList(tmPat,true); + bool CC = (lev > trfls); + Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC); + misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data"); + +#ifdef USE_GPU + bssn_gpu_clear_cached_device_buffers(); + bssn_gpu_release_pinned_host_buffers(); +#endif + Parallel::KillBlocks(PatL[lev]); + PatL[lev]->destroyList(); + PatL[lev] = tmPat; }