Persist GPU RK4 stage caches
This commit is contained in:
@@ -3,6 +3,7 @@
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
@@ -469,18 +470,22 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|||||||
int lev,
|
int lev,
|
||||||
int rk_stage)
|
int rk_stage)
|
||||||
{
|
{
|
||||||
struct Rk4BoundaryCache
|
struct Rk4VarCache
|
||||||
{
|
{
|
||||||
CachedBuffer X, Y, Z;
|
CachedBuffer X, Y, Z;
|
||||||
CachedBuffer state0, boundary, stage, rhs;
|
CachedBuffer state0, boundary, stage, rhs;
|
||||||
const double *host_X = nullptr;
|
const double *host_X = nullptr;
|
||||||
const double *host_Y = nullptr;
|
const double *host_Y = nullptr;
|
||||||
const double *host_Z = nullptr;
|
const double *host_Z = nullptr;
|
||||||
|
const double *host_state0 = nullptr;
|
||||||
|
double *host_rhs = nullptr;
|
||||||
int nx = 0;
|
int nx = 0;
|
||||||
int ny = 0;
|
int ny = 0;
|
||||||
int nz = 0;
|
int nz = 0;
|
||||||
|
bool rhs_resident = false;
|
||||||
};
|
};
|
||||||
static thread_local Rk4BoundaryCache cache;
|
static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map;
|
||||||
|
Rk4VarCache &cache = cache_map[state0];
|
||||||
|
|
||||||
int nx = ex[0];
|
int nx = ex[0];
|
||||||
int ny = ex[1];
|
int ny = ex[1];
|
||||||
@@ -497,8 +502,6 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|||||||
const bool need_coord_copy = need_bam_boundary;
|
const bool need_coord_copy = need_bam_boundary;
|
||||||
const bool need_boundary_input = need_bam_boundary && (rk_stage != 0);
|
const bool need_boundary_input = need_bam_boundary && (rk_stage != 0);
|
||||||
const bool need_stage_input = (rk_stage != 0);
|
const bool need_stage_input = (rk_stage != 0);
|
||||||
const bool need_rhs_output = (rk_stage != 3);
|
|
||||||
|
|
||||||
bool ok = true;
|
bool ok = true;
|
||||||
if (need_coord_copy &&
|
if (need_coord_copy &&
|
||||||
(cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
|
(cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
|
||||||
@@ -518,11 +521,16 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const bool refresh_state0 =
|
||||||
|
(rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz;
|
||||||
|
const bool refresh_rhs =
|
||||||
|
(rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
|
||||||
|
|
||||||
ok = ok &&
|
ok = ok &&
|
||||||
copy_to_device(cache.state0, state0, bytes) &&
|
(!refresh_state0 || copy_to_device(cache.state0, state0, bytes)) &&
|
||||||
(!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
|
(!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
|
||||||
(!need_stage_input || copy_to_device(cache.stage, stage_data, bytes)) &&
|
(!need_stage_input || copy_to_device(cache.stage, stage_data, bytes)) &&
|
||||||
copy_to_device(cache.rhs, rhs_accum, bytes);
|
(!refresh_rhs || copy_to_device(cache.rhs, rhs_accum, bytes));
|
||||||
|
|
||||||
if (ok && !need_stage_input)
|
if (ok && !need_stage_input)
|
||||||
ok = ensure_capacity(cache.stage, bytes);
|
ok = ensure_capacity(cache.stage, bytes);
|
||||||
@@ -530,6 +538,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|||||||
if (!ok)
|
if (!ok)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
|
if (refresh_state0)
|
||||||
|
cache.host_state0 = state0;
|
||||||
|
if (refresh_rhs)
|
||||||
|
{
|
||||||
|
cache.host_rhs = rhs_accum;
|
||||||
|
cache.rhs_resident = true;
|
||||||
|
}
|
||||||
|
|
||||||
double dX = X[1] - X[0];
|
double dX = X[1] - X[0];
|
||||||
double dY = Y[1] - Y[0];
|
double dY = Y[1] - Y[0];
|
||||||
double dZ = Z[1] - Z[0];
|
double dZ = Z[1] - Z[0];
|
||||||
@@ -591,12 +607,6 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|||||||
cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
|
cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
|
||||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
||||||
ok = err == cudaSuccess;
|
ok = err == cudaSuccess;
|
||||||
if (ok && need_rhs_output)
|
|
||||||
{
|
|
||||||
err = cudaMemcpy(rhs_accum, cache.rhs.ptr, bytes, cudaMemcpyDeviceToHost);
|
|
||||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err);
|
|
||||||
ok = err == cudaSuccess;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return ok ? 0 : 1;
|
return ok ? 0 : 1;
|
||||||
|
|||||||
Reference in New Issue
Block a user