Stabilize GPU buffer lifecycle around regrid
This commit is contained in:
@@ -11,6 +11,9 @@ using namespace std;
|
|||||||
|
|
||||||
#include "Block.h"
|
#include "Block.h"
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
|
#ifdef USE_GPU
|
||||||
|
#include "bssn_gpu.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
|
Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
|
||||||
{
|
{
|
||||||
@@ -101,6 +104,9 @@ Block::~Block()
|
|||||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
if (myrank == rank)
|
if (myrank == rank)
|
||||||
{
|
{
|
||||||
|
#ifdef USE_GPU
|
||||||
|
bssn_gpu_clear_cached_device_buffers();
|
||||||
|
#endif
|
||||||
for (int i = 0; i < dim; i++)
|
for (int i = 0; i < dim; i++)
|
||||||
delete[] X[i];
|
delete[] X[i];
|
||||||
for (int i = 0; i < ingfs; i++)
|
for (int i = 0; i < ingfs; i++)
|
||||||
|
|||||||
@@ -749,6 +749,7 @@ void bssn_class::Initialize()
|
|||||||
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
|
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
|
||||||
sync_cache_restrict = new Parallel::SyncCache[GH->levels];
|
sync_cache_restrict = new Parallel::SyncCache[GH->levels];
|
||||||
sync_cache_outbd = new Parallel::SyncCache[GH->levels];
|
sync_cache_outbd = new Parallel::SyncCache[GH->levels];
|
||||||
|
sync_cache_psi4 = new Parallel::SyncCache[GH->levels];
|
||||||
}
|
}
|
||||||
|
|
||||||
//================================================================================================
|
//================================================================================================
|
||||||
@@ -1025,6 +1026,24 @@ bssn_class::~bssn_class()
|
|||||||
sync_cache_rp_fine[i].destroy();
|
sync_cache_rp_fine[i].destroy();
|
||||||
delete[] sync_cache_rp_fine;
|
delete[] sync_cache_rp_fine;
|
||||||
}
|
}
|
||||||
|
if (sync_cache_restrict)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < GH->levels; i++)
|
||||||
|
sync_cache_restrict[i].destroy();
|
||||||
|
delete[] sync_cache_restrict;
|
||||||
|
}
|
||||||
|
if (sync_cache_outbd)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < GH->levels; i++)
|
||||||
|
sync_cache_outbd[i].destroy();
|
||||||
|
delete[] sync_cache_outbd;
|
||||||
|
}
|
||||||
|
if (sync_cache_psi4)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < GH->levels; i++)
|
||||||
|
sync_cache_psi4[i].destroy();
|
||||||
|
delete[] sync_cache_psi4;
|
||||||
|
}
|
||||||
|
|
||||||
delete GH;
|
delete GH;
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
@@ -1060,6 +1079,23 @@ bssn_class::~bssn_class()
|
|||||||
delete CheckPoint;
|
delete CheckPoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Invalidate every per-level synchronization cache owned by this object.
// Returns immediately when GH is null; otherwise walks levels
// 0 .. GH->levels-1 and calls invalidate() on each of the seven SyncCache
// arrays, including sync_cache_psi4.
// The call sites visible in this diff invoke it right after GH->Regrid /
// GH->Regrid_Onelevel, replacing an inline loop that did not touch
// sync_cache_psi4.
// NOTE(review): the cache arrays are indexed here without a null check, while
// the destructor guards sync_cache_restrict / sync_cache_outbd /
// sync_cache_psi4 with `if (ptr)` - confirm they are always allocated before
// this function can be reached.
void bssn_class::InvalidateSyncCaches()
|
||||||
|
{
|
||||||
|
// No grid hierarchy means there is no level count to iterate over.
if (!GH)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// One entry per level in every cache array.
for (int il = 0; il < GH->levels; il++)
|
||||||
|
{
|
||||||
|
sync_cache_pre[il].invalidate();
|
||||||
|
sync_cache_cor[il].invalidate();
|
||||||
|
sync_cache_rp_coarse[il].invalidate();
|
||||||
|
sync_cache_rp_fine[il].invalidate();
|
||||||
|
sync_cache_restrict[il].invalidate();
|
||||||
|
sync_cache_outbd[il].invalidate();
|
||||||
|
// Cache used by the Sync_cached call in Compute_Psi4.
sync_cache_psi4[il].invalidate();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//================================================================================================
|
//================================================================================================
|
||||||
|
|
||||||
|
|
||||||
@@ -2229,7 +2265,7 @@ void bssn_class::Evolve(int Steps)
|
|||||||
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
|
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
InvalidateSyncCaches();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
|
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
|
||||||
@@ -2450,7 +2486,7 @@ void bssn_class::RecursiveStep(int lev)
|
|||||||
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
InvalidateSyncCaches();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2629,7 +2665,7 @@ void bssn_class::ParallelStep()
|
|||||||
if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
InvalidateSyncCaches();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2796,7 +2832,7 @@ void bssn_class::ParallelStep()
|
|||||||
if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
|
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
InvalidateSyncCaches();
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
// a_stream.str("");
|
// a_stream.str("");
|
||||||
@@ -2811,7 +2847,7 @@ void bssn_class::ParallelStep()
|
|||||||
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
InvalidateSyncCaches();
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
// a_stream.str("");
|
// a_stream.str("");
|
||||||
@@ -2830,7 +2866,7 @@ void bssn_class::ParallelStep()
|
|||||||
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
InvalidateSyncCaches();
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
// a_stream.str("");
|
// a_stream.str("");
|
||||||
@@ -2846,7 +2882,7 @@ void bssn_class::ParallelStep()
|
|||||||
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
InvalidateSyncCaches();
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
// a_stream.str("");
|
// a_stream.str("");
|
||||||
@@ -6262,7 +6298,7 @@ for(int ilev = GH->levels-1;ilev>=lev;ilev--)
|
|||||||
for(int ilev=GH->levels-1;ilev>lev;ilev--)
|
for(int ilev=GH->levels-1;ilev>lev;ilev--)
|
||||||
RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List);
|
RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List);
|
||||||
#else
|
#else
|
||||||
Parallel::Sync(GH->PatL[lev], DG_List, Symmetry, "bssn_class::Compute_Psi4");
|
Parallel::Sync_cached(GH->PatL[lev], DG_List, Symmetry, sync_cache_psi4[lev]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
|
|||||||
@@ -132,6 +132,7 @@ public:
|
|||||||
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
|
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
|
||||||
Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong
|
Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong
|
||||||
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
|
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
|
||||||
|
Parallel::SyncCache *sync_cache_psi4; // cached Psi4 sync on PatL[lev]
|
||||||
|
|
||||||
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
|
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
|
||||||
monitor *ConVMonitor;
|
monitor *ConVMonitor;
|
||||||
@@ -176,6 +177,7 @@ public:
|
|||||||
virtual void Initialize();
|
virtual void Initialize();
|
||||||
virtual void Read_Ansorg();
|
virtual void Read_Ansorg();
|
||||||
virtual void Read_Pablo() {};
|
virtual void Read_Pablo() {};
|
||||||
|
void InvalidateSyncCaches();
|
||||||
virtual void Compute_Psi4(int lev);
|
virtual void Compute_Psi4(int lev);
|
||||||
virtual void Step(int lev, int YN);
|
virtual void Step(int lev, int YN);
|
||||||
#ifdef USE_GPU
|
#ifdef USE_GPU
|
||||||
|
|||||||
@@ -1201,9 +1201,9 @@ int bssn_cuda_prolong3_pack(int wei,
|
|||||||
if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
|
if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
// The current input runs with equatorial symmetry enabled.
|
// The symmetry-aware prolong CUDA path is still not equivalent to the
|
||||||
// The symmetry-aware prolong CUDA path is not numerically stable yet,
|
// active Cell/ghost_width=3 Fortran implementation, so keep the safe
|
||||||
// so force a safe fallback to the original Fortran implementation.
|
// fallback for all symmetry-enabled cases.
|
||||||
if (symmetry != 0)
|
if (symmetry != 0)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
@@ -1276,7 +1276,6 @@ int bssn_cuda_prolong3_pack(int wei,
|
|||||||
|
|
||||||
// Current CUDA prolong path only supports the same fast path as the
|
// Current CUDA prolong path only supports the same fast path as the
|
||||||
// optimized Fortran code: interior stencil access without symmetry_bd().
|
// optimized Fortran code: interior stencil access without symmetry_bd().
|
||||||
// If the stencil touches the symmetry boundary, fall back to Fortran.
|
|
||||||
if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1)
|
if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
|
|||||||
@@ -135,7 +135,7 @@ struct GpuRhsCache
|
|||||||
const double *last_y = nullptr;
|
const double *last_y = nullptr;
|
||||||
const double *last_z = nullptr;
|
const double *last_z = nullptr;
|
||||||
bool meta_uploaded = false;
|
bool meta_uploaded = false;
|
||||||
static const int max_mapped_buffers = 128;
|
static const int max_mapped_buffers = 512;
|
||||||
const double *host_buffers[max_mapped_buffers] = {nullptr};
|
const double *host_buffers[max_mapped_buffers] = {nullptr};
|
||||||
const double *device_buffers[max_mapped_buffers] = {nullptr};
|
const double *device_buffers[max_mapped_buffers] = {nullptr};
|
||||||
int mapped_buffer_count = 0;
|
int mapped_buffer_count = 0;
|
||||||
@@ -143,7 +143,7 @@ struct GpuRhsCache
|
|||||||
|
|
||||||
struct ExternalBufferRegistry
|
struct ExternalBufferRegistry
|
||||||
{
|
{
|
||||||
static const int max_mapped_buffers = 256;
|
static const int max_mapped_buffers = 4096;
|
||||||
const double *host_buffers[max_mapped_buffers] = {nullptr};
|
const double *host_buffers[max_mapped_buffers] = {nullptr};
|
||||||
const double *device_buffers[max_mapped_buffers] = {nullptr};
|
const double *device_buffers[max_mapped_buffers] = {nullptr};
|
||||||
int mapped_buffer_count = 0;
|
int mapped_buffer_count = 0;
|
||||||
@@ -151,7 +151,7 @@ struct ExternalBufferRegistry
|
|||||||
|
|
||||||
struct OwnedBufferRegistry
|
struct OwnedBufferRegistry
|
||||||
{
|
{
|
||||||
static const int max_mapped_buffers = 256;
|
static const int max_mapped_buffers = 4096;
|
||||||
const double *host_buffers[max_mapped_buffers] = {nullptr};
|
const double *host_buffers[max_mapped_buffers] = {nullptr};
|
||||||
double *device_buffers[max_mapped_buffers] = {nullptr};
|
double *device_buffers[max_mapped_buffers] = {nullptr};
|
||||||
size_t capacities[max_mapped_buffers] = {0};
|
size_t capacities[max_mapped_buffers] = {0};
|
||||||
@@ -223,7 +223,11 @@ void map_buffer(GpuRhsCache &cache, const double *host_ptr, const double *device
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (cache.mapped_buffer_count >= GpuRhsCache::max_mapped_buffers)
|
if (cache.mapped_buffer_count >= GpuRhsCache::max_mapped_buffers)
|
||||||
|
{
|
||||||
|
cerr << "gpu RHS buffer registry exhausted at " << GpuRhsCache::max_mapped_buffers
|
||||||
|
<< " entries" << endl;
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
cache.host_buffers[cache.mapped_buffer_count] = host_ptr;
|
cache.host_buffers[cache.mapped_buffer_count] = host_ptr;
|
||||||
cache.device_buffers[cache.mapped_buffer_count] = device_ptr;
|
cache.device_buffers[cache.mapped_buffer_count] = device_ptr;
|
||||||
@@ -255,7 +259,11 @@ void map_external_buffer(ExternalBufferRegistry ®istry, const double *host_pt
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (registry.mapped_buffer_count >= ExternalBufferRegistry::max_mapped_buffers)
|
if (registry.mapped_buffer_count >= ExternalBufferRegistry::max_mapped_buffers)
|
||||||
|
{
|
||||||
|
cerr << "external CUDA buffer registry exhausted at "
|
||||||
|
<< ExternalBufferRegistry::max_mapped_buffers << " entries" << endl;
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
registry.host_buffers[registry.mapped_buffer_count] = host_ptr;
|
registry.host_buffers[registry.mapped_buffer_count] = host_ptr;
|
||||||
registry.device_buffers[registry.mapped_buffer_count] = device_ptr;
|
registry.device_buffers[registry.mapped_buffer_count] = device_ptr;
|
||||||
@@ -421,6 +429,7 @@ void ensure_host_buffer_registered(const double *host_ptr, size_t bytes)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cerr << "cudaHostRegister failed: " << cudaGetErrorString(err) << endl;
|
||||||
registry.failed[slot] = true;
|
registry.failed[slot] = true;
|
||||||
registry.capacities[slot] = bytes;
|
registry.capacities[slot] = bytes;
|
||||||
}
|
}
|
||||||
@@ -932,6 +941,25 @@ void bssn_gpu_clear_cached_device_buffers()
|
|||||||
invalidate_owned_buffer_map(owned_buffer_registry());
|
invalidate_owned_buffer_map(owned_buffer_registry());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Unregister every host buffer tracked by the pinned-host registry and reset
// the registry to empty.
// For each of the first buffer_count slots: if the slot is marked registered
// and holds a non-null host pointer, cudaHostUnregister is called on it; a
// failure is reported on cerr unless the error is
// cudaErrorHostMemoryNotRegistered. Every visited slot is then cleared
// (pointer, capacity, registered and failed flags) regardless of the
// unregister outcome, and buffer_count is set to 0.
// The call sites visible in this diff (cgh::recompose_cgh and
// cgh::recompose_cgh_Onelevel) invoke it just before Parallel::KillBlocks.
// NOTE(review): a failed cudaHostUnregister may leave an error recorded in the
// CUDA runtime's last-error state; confirm whether it should be cleared here.
void bssn_gpu_release_pinned_host_buffers()
|
||||||
|
{
|
||||||
|
PinnedHostRegistry &pinned = pinned_host_registry();
|
||||||
|
for (int i = 0; i < pinned.buffer_count; ++i)
|
||||||
|
{
|
||||||
|
// Only slots that were actually registered need an unregister call.
if (pinned.registered[i] && pinned.host_buffers[i])
|
||||||
|
{
|
||||||
|
// const_cast: the registry stores const pointers, cudaHostUnregister takes void *.
cudaError_t unreg_err = cudaHostUnregister(const_cast<double *>(pinned.host_buffers[i]));
|
||||||
|
// "Not registered" is tolerated silently; any other failure is logged.
if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered)
|
||||||
|
cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl;
|
||||||
|
}
|
||||||
|
// Reset the slot unconditionally so stale pointers and failure marks do not persist.
pinned.host_buffers[i] = nullptr;
|
||||||
|
pinned.capacities[i] = 0;
|
||||||
|
pinned.registered[i] = false;
|
||||||
|
pinned.failed[i] = false;
|
||||||
|
}
|
||||||
|
pinned.buffer_count = 0;
|
||||||
|
}
|
||||||
|
|
||||||
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr)
|
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr)
|
||||||
{
|
{
|
||||||
map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
|
map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
|
||||||
|
|||||||
@@ -67,6 +67,7 @@ int gpu_rhs_ss(RHS_SS_PARA);
|
|||||||
|
|
||||||
int bssn_gpu_bind_process_device(int mpi_rank);
|
int bssn_gpu_bind_process_device(int mpi_rank);
|
||||||
void bssn_gpu_clear_cached_device_buffers();
|
void bssn_gpu_clear_cached_device_buffers();
|
||||||
|
void bssn_gpu_release_pinned_host_buffers();
|
||||||
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
|
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
|
||||||
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
|
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
|
||||||
void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count);
|
void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count);
|
||||||
|
|||||||
@@ -1022,9 +1022,16 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
|
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
|
||||||
|
|
||||||
#if (GAUGE == 2)
|
#if (GAUGE == 2)
|
||||||
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
|
{
|
||||||
|
const double chi_sqrt = sqrt(chin1[i]);
|
||||||
|
const double damping = ONE - chi_sqrt;
|
||||||
|
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
|
{
|
||||||
|
const double damping = ONE - chin1[i];
|
||||||
|
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i];
|
dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i];
|
||||||
@@ -1040,9 +1047,16 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
|
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
|
||||||
|
|
||||||
#if (GAUGE == 4)
|
#if (GAUGE == 4)
|
||||||
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
|
{
|
||||||
|
const double chi_sqrt = sqrt(chin1[i]);
|
||||||
|
const double damping = ONE - chi_sqrt;
|
||||||
|
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
|
{
|
||||||
|
const double damping = ONE - chin1[i];
|
||||||
|
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i];
|
betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i];
|
||||||
|
|||||||
@@ -27,6 +27,9 @@ using namespace std;
|
|||||||
#include "cgh.h"
|
#include "cgh.h"
|
||||||
#include "Parallel.h"
|
#include "Parallel.h"
|
||||||
#include "parameters.h"
|
#include "parameters.h"
|
||||||
|
#ifdef USE_GPU
|
||||||
|
#include "bssn_gpu.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
//================================================================================================
|
//================================================================================================
|
||||||
|
|
||||||
@@ -885,6 +888,10 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
|
|||||||
bool CC = (lev > trfls);
|
bool CC = (lev > trfls);
|
||||||
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
|
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
|
||||||
|
|
||||||
|
#ifdef USE_GPU
|
||||||
|
bssn_gpu_clear_cached_device_buffers();
|
||||||
|
bssn_gpu_release_pinned_host_buffers();
|
||||||
|
#endif
|
||||||
Parallel::KillBlocks(PatL[lev]);
|
Parallel::KillBlocks(PatL[lev]);
|
||||||
PatL[lev]->destroyList();
|
PatL[lev]->destroyList();
|
||||||
PatL[lev] = tmPat;
|
PatL[lev] = tmPat;
|
||||||
@@ -914,6 +921,10 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
|
|||||||
bool CC = (lev > trfls);
|
bool CC = (lev > trfls);
|
||||||
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
|
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
|
||||||
|
|
||||||
|
#ifdef USE_GPU
|
||||||
|
bssn_gpu_clear_cached_device_buffers();
|
||||||
|
bssn_gpu_release_pinned_host_buffers();
|
||||||
|
#endif
|
||||||
Parallel::KillBlocks(PatL[lev]);
|
Parallel::KillBlocks(PatL[lev]);
|
||||||
PatL[lev]->destroyList();
|
PatL[lev]->destroyList();
|
||||||
PatL[lev] = tmPat;
|
PatL[lev] = tmPat;
|
||||||
@@ -1522,6 +1533,10 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
|
|||||||
bool CC = (lev > trfls);
|
bool CC = (lev > trfls);
|
||||||
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
|
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
|
||||||
|
|
||||||
|
#ifdef USE_GPU
|
||||||
|
bssn_gpu_clear_cached_device_buffers();
|
||||||
|
bssn_gpu_release_pinned_host_buffers();
|
||||||
|
#endif
|
||||||
Parallel::KillBlocks(PatL[lev]);
|
Parallel::KillBlocks(PatL[lev]);
|
||||||
PatL[lev]->destroyList();
|
PatL[lev]->destroyList();
|
||||||
PatL[lev] = tmPat;
|
PatL[lev] = tmPat;
|
||||||
@@ -1545,6 +1560,10 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
|
|||||||
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
|
Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
|
||||||
misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
|
misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
|
||||||
|
|
||||||
|
#ifdef USE_GPU
|
||||||
|
bssn_gpu_clear_cached_device_buffers();
|
||||||
|
bssn_gpu_release_pinned_host_buffers();
|
||||||
|
#endif
|
||||||
Parallel::KillBlocks(PatL[lev]);
|
Parallel::KillBlocks(PatL[lev]);
|
||||||
PatL[lev]->destroyList();
|
PatL[lev]->destroyList();
|
||||||
PatL[lev] = tmPat;
|
PatL[lev] = tmPat;
|
||||||
|
|||||||
Reference in New Issue
Block a user