Fix GPU interpolation cache lifetime leaks
This commit is contained in:
@@ -13,6 +13,7 @@ using namespace std;
|
|||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
#ifdef USE_GPU
|
#ifdef USE_GPU
|
||||||
#include "bssn_gpu.h"
|
#include "bssn_gpu.h"
|
||||||
|
#include "bssn_cuda_ops.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
|
Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
|
||||||
@@ -106,6 +107,8 @@ Block::~Block()
|
|||||||
{
|
{
|
||||||
#ifdef USE_GPU
|
#ifdef USE_GPU
|
||||||
bssn_gpu_clear_cached_device_buffers();
|
bssn_gpu_clear_cached_device_buffers();
|
||||||
|
bssn_cuda_release_rk4_caches();
|
||||||
|
bssn_cuda_release_interp_caches();
|
||||||
#endif
|
#endif
|
||||||
for (int i = 0; i < dim; i++)
|
for (int i = 0; i < dim; i++)
|
||||||
delete[] X[i];
|
delete[] X[i];
|
||||||
|
|||||||
@@ -79,6 +79,15 @@ struct CachedInterpPlan
|
|||||||
CachedInterpPlan() : nblocks(0) {}
|
CachedInterpPlan() : nblocks(0) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct CachedInterpPlanEntry
|
||||||
|
{
|
||||||
|
bool valid;
|
||||||
|
InterpPlanKey key;
|
||||||
|
CachedInterpPlan plan;
|
||||||
|
|
||||||
|
CachedInterpPlanEntry() : valid(false) {}
|
||||||
|
};
|
||||||
|
|
||||||
struct InterpBlockView
|
struct InterpBlockView
|
||||||
{
|
{
|
||||||
Block *bp;
|
Block *bp;
|
||||||
@@ -268,6 +277,23 @@ bool should_try_cuda_interp(int ordn, int num_points, int num_var)
|
|||||||
return num_points * num_var >= 256;
|
return num_points * num_var >= 256;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Accessor for the single interpolation-plan cache slot. The slot is a
// function-local static, so it is constructed on first use and every caller
// gets a reference to the same object.
CachedInterpPlanEntry &interp_plan_cache_entry()
{
  static CachedInterpPlanEntry single_slot;
  return single_slot;
}
|
||||||
|
|
||||||
|
bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs)
|
||||||
|
{
|
||||||
|
return lhs.patch == rhs.patch &&
|
||||||
|
lhs.x == rhs.x &&
|
||||||
|
lhs.y == rhs.y &&
|
||||||
|
lhs.z == rhs.z &&
|
||||||
|
lhs.NN == rhs.NN &&
|
||||||
|
lhs.Symmetry == rhs.Symmetry &&
|
||||||
|
lhs.myrank == rhs.myrank;
|
||||||
|
}
|
||||||
|
|
||||||
CachedInterpPlan &get_cached_interp_plan(Patch *patch,
|
CachedInterpPlan &get_cached_interp_plan(Patch *patch,
|
||||||
int NN, double **XX,
|
int NN, double **XX,
|
||||||
int Symmetry, int myrank,
|
int Symmetry, int myrank,
|
||||||
@@ -276,8 +302,6 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch,
|
|||||||
bool report_bounds_here,
|
bool report_bounds_here,
|
||||||
bool allow_missing_points)
|
bool allow_missing_points)
|
||||||
{
|
{
|
||||||
static map<InterpPlanKey, CachedInterpPlan, InterpPlanKeyLess> cache;
|
|
||||||
|
|
||||||
InterpPlanKey key;
|
InterpPlanKey key;
|
||||||
key.patch = patch;
|
key.patch = patch;
|
||||||
key.x = XX[0];
|
key.x = XX[0];
|
||||||
@@ -287,12 +311,16 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch,
|
|||||||
key.Symmetry = Symmetry;
|
key.Symmetry = Symmetry;
|
||||||
key.myrank = myrank;
|
key.myrank = myrank;
|
||||||
|
|
||||||
map<InterpPlanKey, CachedInterpPlan, InterpPlanKeyLess>::iterator it = cache.find(key);
|
CachedInterpPlanEntry &cache = interp_plan_cache_entry();
|
||||||
if (it != cache.end() && it->second.nblocks == static_cast<int>(block_index.views.size()))
|
if (cache.valid &&
|
||||||
return it->second;
|
same_interp_plan_key(cache.key, key) &&
|
||||||
|
cache.plan.nblocks == static_cast<int>(block_index.views.size()))
|
||||||
|
return cache.plan;
|
||||||
|
|
||||||
CachedInterpPlan &plan = cache[key];
|
cache.valid = true;
|
||||||
plan = CachedInterpPlan();
|
cache.key = key;
|
||||||
|
cache.plan = CachedInterpPlan();
|
||||||
|
CachedInterpPlan &plan = cache.plan;
|
||||||
plan.nblocks = static_cast<int>(block_index.views.size());
|
plan.nblocks = static_cast<int>(block_index.views.size());
|
||||||
plan.owner_rank.assign(NN, -1);
|
plan.owner_rank.assign(NN, -1);
|
||||||
plan.owner_block.assign(NN, -1);
|
plan.owner_block.assign(NN, -1);
|
||||||
@@ -380,6 +408,13 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch,
|
|||||||
return plan;
|
return plan;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void release_interp_plan_cache_internal()
|
||||||
|
{
|
||||||
|
CachedInterpPlanEntry &cache = interp_plan_cache_entry();
|
||||||
|
cache.valid = false;
|
||||||
|
cache.plan = CachedInterpPlan();
|
||||||
|
}
|
||||||
|
|
||||||
bool run_cuda_interp_for_block(Block *BP,
|
bool run_cuda_interp_for_block(Block *BP,
|
||||||
const vector<InterpVarDesc> &vars,
|
const vector<InterpVarDesc> &vars,
|
||||||
const vector<int> &point_ids,
|
const vector<int> &point_ids,
|
||||||
@@ -487,9 +522,14 @@ void interpolate_owned_points(MyList<var> *VarList,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
|
// Public entry point for dropping the cached interpolation plan; it only
// forwards to release_interp_plan_cache_internal().
void patch_release_interp_plan_cache()
|
||||||
{
|
{
|
||||||
|
release_interp_plan_cache_internal();
|
||||||
|
}
|
||||||
|
|
||||||
|
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
|
||||||
|
{
|
||||||
|
|
||||||
int hbuffer_width = buffer_width;
|
int hbuffer_width = buffer_width;
|
||||||
if (lev == 0)
|
if (lev == 0)
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
#include "var.h"
|
#include "var.h"
|
||||||
#include "macrodef.h" //need dim here; Vertex or Cell; ghost_width
|
#include "macrodef.h" //need dim here; Vertex or Cell; ghost_width
|
||||||
|
|
||||||
class Patch
|
class Patch
|
||||||
{
|
{
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@@ -50,6 +50,8 @@ public:
|
|||||||
double *Shellf, int Symmetry, MPI_Comm Comm_here);
|
double *Shellf, int Symmetry, MPI_Comm Comm_here);
|
||||||
void Find_Maximum(MyList<var> *VarList, double *XX,
|
void Find_Maximum(MyList<var> *VarList, double *XX,
|
||||||
double *Shellf, MPI_Comm Comm_here);
|
double *Shellf, MPI_Comm Comm_here);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* PATCH_H */
|
// Invalidates the interpolation plan cached by the Patch implementation so the
// next interpolation request rebuilds it. Free function, not a Patch member.
void patch_release_interp_plan_cache();
|
||||||
|
|
||||||
|
#endif /* PATCH_H */
|
||||||
|
|||||||
@@ -48,6 +48,30 @@ struct CachedIntBuffer
|
|||||||
size_t capacity = 0;
|
size_t capacity = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Frees the allocation owned by `buffer` (if any) with cudaFree and resets the
// buffer to the empty state (null pointer, zero capacity). A cudaFree failure
// is passed to report_cuda_error; the pointer is cleared regardless.
inline void release_buffer(CachedBuffer &buffer)
{
  if (buffer.ptr != nullptr)
  {
    const cudaError_t status = cudaFree(buffer.ptr);
    if (status != cudaSuccess)
    {
      report_cuda_error("cudaFree", status);
    }
    buffer.ptr = nullptr;
  }

  buffer.capacity = 0;
}
|
||||||
|
|
||||||
|
// CachedIntBuffer overload: frees the allocation owned by `buffer` (if any)
// with cudaFree and resets it to the empty state (null pointer, zero
// capacity). A cudaFree failure is passed to report_cuda_error; the pointer is
// cleared regardless.
inline void release_buffer(CachedIntBuffer &buffer)
{
  if (buffer.ptr != nullptr)
  {
    const cudaError_t status = cudaFree(buffer.ptr);
    if (status != cudaSuccess)
    {
      report_cuda_error("cudaFree", status);
    }
    buffer.ptr = nullptr;
  }

  buffer.capacity = 0;
}
|
||||||
|
|
||||||
inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
|
inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
|
||||||
{
|
{
|
||||||
if (bytes <= buffer.capacity && buffer.ptr)
|
if (bytes <= buffer.capacity && buffer.ptr)
|
||||||
@@ -98,6 +122,95 @@ inline bool ensure_capacity(CachedIntBuffer &buffer, size_t bytes)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct Rk4VarCache
|
||||||
|
{
|
||||||
|
CachedBuffer X, Y, Z;
|
||||||
|
CachedBuffer state0, boundary, stage, rhs;
|
||||||
|
const double *host_X = nullptr;
|
||||||
|
const double *host_Y = nullptr;
|
||||||
|
const double *host_Z = nullptr;
|
||||||
|
const double *host_state0 = nullptr;
|
||||||
|
double *host_rhs = nullptr;
|
||||||
|
int nx = 0;
|
||||||
|
int ny = 0;
|
||||||
|
int nz = 0;
|
||||||
|
bool rhs_resident = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct InterpStencilCacheEntry
|
||||||
|
{
|
||||||
|
const double *X = nullptr;
|
||||||
|
const double *Y = nullptr;
|
||||||
|
const double *Z = nullptr;
|
||||||
|
const double *px = nullptr;
|
||||||
|
const double *py = nullptr;
|
||||||
|
const double *pz = nullptr;
|
||||||
|
int nx = 0;
|
||||||
|
int ny = 0;
|
||||||
|
int nz = 0;
|
||||||
|
int num_points = 0;
|
||||||
|
int ordn = 0;
|
||||||
|
int symmetry = 0;
|
||||||
|
bool valid = false;
|
||||||
|
CachedBuffer weights;
|
||||||
|
CachedIntBuffer indices;
|
||||||
|
CachedIntBuffer reflect;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct InterpBatchCache
|
||||||
|
{
|
||||||
|
CachedBuffer out;
|
||||||
|
CachedBuffer soa;
|
||||||
|
CachedBuffer field_ptrs;
|
||||||
|
CachedIntBuffer error_flag;
|
||||||
|
std::vector<CachedBuffer> host_field_copies;
|
||||||
|
InterpStencilCacheEntry stencil_entry;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::unordered_map<const double *, Rk4VarCache> &rk4_var_cache_map()
|
||||||
|
{
|
||||||
|
static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map;
|
||||||
|
return cache_map;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Accessor for the interpolation batch cache. The cache is a function-local
// `static thread_local`, so each thread that calls this gets its own instance.
InterpBatchCache &interp_batch_cache()
{
  static thread_local InterpBatchCache per_thread_cache;
  return per_thread_cache;
}
|
||||||
|
|
||||||
|
// Frees the stencil tables held by `entry` and returns every identifying field
// to its default, leaving the entry marked invalid so it cannot match a later
// request.
inline void release_interp_stencil_cache(InterpStencilCacheEntry &entry)
{
  // Device-side tables first.
  release_buffer(entry.weights);
  release_buffer(entry.indices);
  release_buffer(entry.reflect);

  // Forget which request the tables belonged to.
  entry.X = entry.Y = entry.Z = nullptr;
  entry.px = entry.py = entry.pz = nullptr;
  entry.nx = entry.ny = entry.nz = 0;
  entry.num_points = 0;
  entry.ordn = 0;
  entry.symmetry = 0;

  entry.valid = false;
}
|
||||||
|
|
||||||
|
// Releases every buffer owned by `cache`: the four scratch buffers, each
// element of host_field_copies (the vector is emptied afterwards), and the
// cached stencil entry.
inline void release_interp_batch_cache(InterpBatchCache &cache)
{
  release_buffer(cache.out);
  release_buffer(cache.soa);
  release_buffer(cache.field_ptrs);
  release_buffer(cache.error_flag);

  for (CachedBuffer &field_copy : cache.host_field_copies)
  {
    release_buffer(field_copy);
  }
  cache.host_field_copies.clear();

  release_interp_stencil_cache(cache.stencil_entry);
}
|
||||||
|
|
||||||
inline bool copy_to_device(CachedIntBuffer &dst, const int *src, size_t bytes)
|
inline bool copy_to_device(CachedIntBuffer &dst, const int *src, size_t bytes)
|
||||||
{
|
{
|
||||||
if (!ensure_capacity(dst, bytes))
|
if (!ensure_capacity(dst, bytes))
|
||||||
@@ -731,22 +844,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|||||||
int rk_stage,
|
int rk_stage,
|
||||||
bool download_to_host)
|
bool download_to_host)
|
||||||
{
|
{
|
||||||
struct Rk4VarCache
|
Rk4VarCache &cache = rk4_var_cache_map()[state0];
|
||||||
{
|
|
||||||
CachedBuffer X, Y, Z;
|
|
||||||
CachedBuffer state0, boundary, stage, rhs;
|
|
||||||
const double *host_X = nullptr;
|
|
||||||
const double *host_Y = nullptr;
|
|
||||||
const double *host_Z = nullptr;
|
|
||||||
const double *host_state0 = nullptr;
|
|
||||||
double *host_rhs = nullptr;
|
|
||||||
int nx = 0;
|
|
||||||
int ny = 0;
|
|
||||||
int nz = 0;
|
|
||||||
bool rhs_resident = false;
|
|
||||||
};
|
|
||||||
static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map;
|
|
||||||
Rk4VarCache &cache = cache_map[state0];
|
|
||||||
|
|
||||||
int nx = ex[0];
|
int nx = ex[0];
|
||||||
int ny = ex[1];
|
int ny = ex[1];
|
||||||
@@ -909,6 +1007,29 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|||||||
return ok ? 0 : 1;
|
return ok ? 0 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void bssn_cuda_release_rk4_caches()
|
||||||
|
{
|
||||||
|
std::unordered_map<const double *, Rk4VarCache> &cache_map = rk4_var_cache_map();
|
||||||
|
for (std::unordered_map<const double *, Rk4VarCache>::iterator it = cache_map.begin();
|
||||||
|
it != cache_map.end(); ++it)
|
||||||
|
{
|
||||||
|
Rk4VarCache &cache = it->second;
|
||||||
|
release_buffer(cache.X);
|
||||||
|
release_buffer(cache.Y);
|
||||||
|
release_buffer(cache.Z);
|
||||||
|
release_buffer(cache.state0);
|
||||||
|
release_buffer(cache.boundary);
|
||||||
|
release_buffer(cache.stage);
|
||||||
|
release_buffer(cache.rhs);
|
||||||
|
}
|
||||||
|
cache_map.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
void bssn_cuda_release_interp_caches()
|
||||||
|
{
|
||||||
|
release_interp_batch_cache(interp_batch_cache());
|
||||||
|
}
|
||||||
|
|
||||||
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host)
|
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host)
|
||||||
{
|
{
|
||||||
static thread_local CachedBuffer d_chi;
|
static thread_local CachedBuffer d_chi;
|
||||||
@@ -988,40 +1109,7 @@ int bssn_cuda_interp_points_batch(const int *ex,
|
|||||||
if (ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
|
if (ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
struct InterpBatchCache
|
InterpBatchCache &cache = interp_batch_cache();
|
||||||
{
|
|
||||||
struct StencilCacheEntry
|
|
||||||
{
|
|
||||||
const double *X;
|
|
||||||
const double *Y;
|
|
||||||
const double *Z;
|
|
||||||
const double *px;
|
|
||||||
const double *py;
|
|
||||||
const double *pz;
|
|
||||||
int nx;
|
|
||||||
int ny;
|
|
||||||
int nz;
|
|
||||||
int num_points;
|
|
||||||
int ordn;
|
|
||||||
int symmetry;
|
|
||||||
CachedBuffer weights;
|
|
||||||
CachedIntBuffer indices;
|
|
||||||
CachedIntBuffer reflect;
|
|
||||||
|
|
||||||
StencilCacheEntry()
|
|
||||||
: X(nullptr), Y(nullptr), Z(nullptr),
|
|
||||||
px(nullptr), py(nullptr), pz(nullptr),
|
|
||||||
nx(0), ny(0), nz(0), num_points(0), ordn(0), symmetry(0) {}
|
|
||||||
};
|
|
||||||
|
|
||||||
CachedBuffer out;
|
|
||||||
CachedBuffer soa;
|
|
||||||
CachedBuffer field_ptrs;
|
|
||||||
CachedIntBuffer error_flag;
|
|
||||||
std::vector<CachedBuffer> host_field_copies;
|
|
||||||
std::vector<StencilCacheEntry> stencil_entries;
|
|
||||||
};
|
|
||||||
static thread_local InterpBatchCache cache;
|
|
||||||
|
|
||||||
const int nx = ex[0];
|
const int nx = ex[0];
|
||||||
const int ny = ex[1];
|
const int ny = ex[1];
|
||||||
@@ -1037,37 +1125,31 @@ int bssn_cuda_interp_points_batch(const int *ex,
|
|||||||
const size_t indices_bytes = point_stencil_ints * sizeof(int);
|
const size_t indices_bytes = point_stencil_ints * sizeof(int);
|
||||||
|
|
||||||
bool ok = true;
|
bool ok = true;
|
||||||
InterpBatchCache::StencilCacheEntry *stencil_cache = nullptr;
|
InterpStencilCacheEntry &stencil_cache = cache.stencil_entry;
|
||||||
for (size_t i = 0; i < cache.stencil_entries.size(); ++i)
|
const bool stencil_match =
|
||||||
{
|
stencil_cache.valid &&
|
||||||
InterpBatchCache::StencilCacheEntry &entry = cache.stencil_entries[i];
|
stencil_cache.X == X && stencil_cache.Y == Y && stencil_cache.Z == Z &&
|
||||||
if (entry.X == X && entry.Y == Y && entry.Z == Z &&
|
stencil_cache.px == px && stencil_cache.py == py && stencil_cache.pz == pz &&
|
||||||
entry.px == px && entry.py == py && entry.pz == pz &&
|
stencil_cache.nx == nx && stencil_cache.ny == ny && stencil_cache.nz == nz &&
|
||||||
entry.nx == nx && entry.ny == ny && entry.nz == nz &&
|
stencil_cache.num_points == num_points && stencil_cache.ordn == ordn &&
|
||||||
entry.num_points == num_points && entry.ordn == ordn &&
|
stencil_cache.symmetry == symmetry;
|
||||||
entry.symmetry == symmetry)
|
|
||||||
{
|
|
||||||
stencil_cache = &entry;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!stencil_cache)
|
if (!stencil_match)
|
||||||
{
|
{
|
||||||
cache.stencil_entries.push_back(InterpBatchCache::StencilCacheEntry());
|
release_interp_stencil_cache(stencil_cache);
|
||||||
stencil_cache = &cache.stencil_entries.back();
|
stencil_cache.X = X;
|
||||||
stencil_cache->X = X;
|
stencil_cache.Y = Y;
|
||||||
stencil_cache->Y = Y;
|
stencil_cache.Z = Z;
|
||||||
stencil_cache->Z = Z;
|
stencil_cache.px = px;
|
||||||
stencil_cache->px = px;
|
stencil_cache.py = py;
|
||||||
stencil_cache->py = py;
|
stencil_cache.pz = pz;
|
||||||
stencil_cache->pz = pz;
|
stencil_cache.nx = nx;
|
||||||
stencil_cache->nx = nx;
|
stencil_cache.ny = ny;
|
||||||
stencil_cache->ny = ny;
|
stencil_cache.nz = nz;
|
||||||
stencil_cache->nz = nz;
|
stencil_cache.num_points = num_points;
|
||||||
stencil_cache->num_points = num_points;
|
stencil_cache.ordn = ordn;
|
||||||
stencil_cache->ordn = ordn;
|
stencil_cache.symmetry = symmetry;
|
||||||
stencil_cache->symmetry = symmetry;
|
stencil_cache.valid = true;
|
||||||
|
|
||||||
std::vector<double> host_weights(point_stencil_doubles);
|
std::vector<double> host_weights(point_stencil_doubles);
|
||||||
std::vector<int> host_indices(point_stencil_ints);
|
std::vector<int> host_indices(point_stencil_ints);
|
||||||
@@ -1104,9 +1186,9 @@ int bssn_cuda_interp_points_batch(const int *ex,
|
|||||||
}
|
}
|
||||||
|
|
||||||
ok = ok &&
|
ok = ok &&
|
||||||
copy_to_device(stencil_cache->weights, host_weights.data(), weights_bytes) &&
|
copy_to_device(stencil_cache.weights, host_weights.data(), weights_bytes) &&
|
||||||
copy_to_device(stencil_cache->indices, host_indices.data(), indices_bytes) &&
|
copy_to_device(stencil_cache.indices, host_indices.data(), indices_bytes) &&
|
||||||
copy_to_device(stencil_cache->reflect, host_reflect.data(), indices_bytes);
|
copy_to_device(stencil_cache.reflect, host_reflect.data(), indices_bytes);
|
||||||
if (!ok)
|
if (!ok)
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@@ -1159,9 +1241,9 @@ int bssn_cuda_interp_points_batch(const int *ex,
|
|||||||
int ny_local = ny;
|
int ny_local = ny;
|
||||||
const double *dsoa = cache.soa.ptr;
|
const double *dsoa = cache.soa.ptr;
|
||||||
const double *const *dfields = reinterpret_cast<const double *const *>(cache.field_ptrs.ptr);
|
const double *const *dfields = reinterpret_cast<const double *const *>(cache.field_ptrs.ptr);
|
||||||
const double *dweights = stencil_cache->weights.ptr;
|
const double *dweights = stencil_cache.weights.ptr;
|
||||||
const int *dindices = stencil_cache->indices.ptr;
|
const int *dindices = stencil_cache.indices.ptr;
|
||||||
const int *dreflect = stencil_cache->reflect.ptr;
|
const int *dreflect = stencil_cache.reflect.ptr;
|
||||||
double *dout = cache.out.ptr;
|
double *dout = cache.out.ptr;
|
||||||
int *derror = cache.error_flag.ptr;
|
int *derror = cache.error_flag.ptr;
|
||||||
|
|
||||||
|
|||||||
@@ -24,6 +24,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|||||||
|
|
||||||
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true);
|
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true);
|
||||||
int bssn_cuda_download_buffer(int *ex, double *host_ptr);
|
int bssn_cuda_download_buffer(int *ex, double *host_ptr);
|
||||||
|
// Frees the buffers held by the RK4 per-variable caches and empties the cache
// map. The map is thread_local, so only the calling thread's caches are affected.
void bssn_cuda_release_rk4_caches();
|
||||||
|
// Frees the buffers held by the interpolation batch cache, including its cached
// stencil entry. The cache is thread_local, so only the calling thread's
// instance is affected.
void bssn_cuda_release_interp_caches();
|
||||||
|
|
||||||
int bssn_cuda_prolong3_pack(int wei,
|
int bssn_cuda_prolong3_pack(int wei,
|
||||||
const double *llbc, const double *uubc, const int *extc, const double *func,
|
const double *llbc, const double *uubc, const int *extc, const double *func,
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ using namespace std;
|
|||||||
#include "parameters.h"
|
#include "parameters.h"
|
||||||
#ifdef USE_GPU
|
#ifdef USE_GPU
|
||||||
#include "bssn_gpu.h"
|
#include "bssn_gpu.h"
|
||||||
|
#include "bssn_cuda_ops.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
//================================================================================================
|
//================================================================================================
|
||||||
@@ -891,6 +892,9 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
|
|||||||
#ifdef USE_GPU
|
#ifdef USE_GPU
|
||||||
bssn_gpu_clear_cached_device_buffers();
|
bssn_gpu_clear_cached_device_buffers();
|
||||||
bssn_gpu_release_pinned_host_buffers();
|
bssn_gpu_release_pinned_host_buffers();
|
||||||
|
bssn_cuda_release_rk4_caches();
|
||||||
|
bssn_cuda_release_interp_caches();
|
||||||
|
patch_release_interp_plan_cache();
|
||||||
#endif
|
#endif
|
||||||
Parallel::KillBlocks(PatL[lev]);
|
Parallel::KillBlocks(PatL[lev]);
|
||||||
PatL[lev]->destroyList();
|
PatL[lev]->destroyList();
|
||||||
@@ -924,6 +928,9 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
|
|||||||
#ifdef USE_GPU
|
#ifdef USE_GPU
|
||||||
bssn_gpu_clear_cached_device_buffers();
|
bssn_gpu_clear_cached_device_buffers();
|
||||||
bssn_gpu_release_pinned_host_buffers();
|
bssn_gpu_release_pinned_host_buffers();
|
||||||
|
bssn_cuda_release_rk4_caches();
|
||||||
|
bssn_cuda_release_interp_caches();
|
||||||
|
patch_release_interp_plan_cache();
|
||||||
#endif
|
#endif
|
||||||
Parallel::KillBlocks(PatL[lev]);
|
Parallel::KillBlocks(PatL[lev]);
|
||||||
PatL[lev]->destroyList();
|
PatL[lev]->destroyList();
|
||||||
@@ -1536,6 +1543,9 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
|
|||||||
#ifdef USE_GPU
|
#ifdef USE_GPU
|
||||||
bssn_gpu_clear_cached_device_buffers();
|
bssn_gpu_clear_cached_device_buffers();
|
||||||
bssn_gpu_release_pinned_host_buffers();
|
bssn_gpu_release_pinned_host_buffers();
|
||||||
|
bssn_cuda_release_rk4_caches();
|
||||||
|
bssn_cuda_release_interp_caches();
|
||||||
|
patch_release_interp_plan_cache();
|
||||||
#endif
|
#endif
|
||||||
Parallel::KillBlocks(PatL[lev]);
|
Parallel::KillBlocks(PatL[lev]);
|
||||||
PatL[lev]->destroyList();
|
PatL[lev]->destroyList();
|
||||||
@@ -1563,6 +1573,9 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
|
|||||||
#ifdef USE_GPU
|
#ifdef USE_GPU
|
||||||
bssn_gpu_clear_cached_device_buffers();
|
bssn_gpu_clear_cached_device_buffers();
|
||||||
bssn_gpu_release_pinned_host_buffers();
|
bssn_gpu_release_pinned_host_buffers();
|
||||||
|
bssn_cuda_release_rk4_caches();
|
||||||
|
bssn_cuda_release_interp_caches();
|
||||||
|
patch_release_interp_plan_cache();
|
||||||
#endif
|
#endif
|
||||||
Parallel::KillBlocks(PatL[lev]);
|
Parallel::KillBlocks(PatL[lev]);
|
||||||
PatL[lev]->destroyList();
|
PatL[lev]->destroyList();
|
||||||
|
|||||||
Reference in New Issue
Block a user