Cache GPU main-path device buffers
This commit is contained in:
@@ -33,15 +33,43 @@ struct DeviceArrays
|
||||
double *d = nullptr;
|
||||
};
|
||||
|
||||
inline bool copy_to_device(double *&dst, const double *src, size_t bytes)
|
||||
struct CachedBuffer
|
||||
{
|
||||
cudaError_t err = cudaMalloc(&dst, bytes);
|
||||
double *ptr = nullptr;
|
||||
size_t capacity = 0;
|
||||
};
|
||||
|
||||
inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
|
||||
{
|
||||
if (bytes <= buffer.capacity && buffer.ptr)
|
||||
return true;
|
||||
|
||||
if (buffer.ptr)
|
||||
{
|
||||
cudaError_t free_err = cudaFree(buffer.ptr);
|
||||
if (free_err != cudaSuccess)
|
||||
report_cuda_error("cudaFree", free_err);
|
||||
buffer.ptr = nullptr;
|
||||
buffer.capacity = 0;
|
||||
}
|
||||
|
||||
cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMalloc", err);
|
||||
return false;
|
||||
}
|
||||
err = cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
|
||||
|
||||
buffer.capacity = bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
|
||||
{
|
||||
if (!ensure_capacity(dst, bytes))
|
||||
return false;
|
||||
|
||||
cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(H2D)", err);
|
||||
@@ -50,12 +78,6 @@ inline bool copy_to_device(double *&dst, const double *src, size_t bytes)
|
||||
return true;
|
||||
}
|
||||
|
||||
inline void free_device(double *ptr)
|
||||
{
|
||||
if (ptr)
|
||||
cudaFree(ptr);
|
||||
}
|
||||
|
||||
__global__ void enforce_ga_kernel(int n,
|
||||
double *dxx, double *gxy, double *gxz,
|
||||
double *dyy, double *gyz, double *dzz,
|
||||
@@ -376,31 +398,37 @@ int bssn_cuda_enforce_ga(int *ex,
|
||||
double *Axx, double *Axy, double *Axz,
|
||||
double *Ayy, double *Ayz, double *Azz)
|
||||
{
|
||||
struct EnforceGaCache
|
||||
{
|
||||
CachedBuffer dxx, gxy, gxz, dyy, gyz, dzz;
|
||||
CachedBuffer Axx, Axy, Axz, Ayy, Ayz, Azz;
|
||||
};
|
||||
static thread_local EnforceGaCache cache;
|
||||
|
||||
int n = count_points(ex);
|
||||
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
||||
dim3 block(256);
|
||||
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
||||
|
||||
double *d_dxx = nullptr, *d_gxy = nullptr, *d_gxz = nullptr;
|
||||
double *d_dyy = nullptr, *d_gyz = nullptr, *d_dzz = nullptr;
|
||||
double *d_Axx = nullptr, *d_Axy = nullptr, *d_Axz = nullptr;
|
||||
double *d_Ayy = nullptr, *d_Ayz = nullptr, *d_Azz = nullptr;
|
||||
|
||||
bool ok = copy_to_device(d_dxx, dxx, bytes) &&
|
||||
copy_to_device(d_gxy, gxy, bytes) &&
|
||||
copy_to_device(d_gxz, gxz, bytes) &&
|
||||
copy_to_device(d_dyy, dyy, bytes) &&
|
||||
copy_to_device(d_gyz, gyz, bytes) &&
|
||||
copy_to_device(d_dzz, dzz, bytes) &&
|
||||
copy_to_device(d_Axx, Axx, bytes) &&
|
||||
copy_to_device(d_Axy, Axy, bytes) &&
|
||||
copy_to_device(d_Axz, Axz, bytes) &&
|
||||
copy_to_device(d_Ayy, Ayy, bytes) &&
|
||||
copy_to_device(d_Ayz, Ayz, bytes) &&
|
||||
copy_to_device(d_Azz, Azz, bytes);
|
||||
bool ok = copy_to_device(cache.dxx, dxx, bytes) &&
|
||||
copy_to_device(cache.gxy, gxy, bytes) &&
|
||||
copy_to_device(cache.gxz, gxz, bytes) &&
|
||||
copy_to_device(cache.dyy, dyy, bytes) &&
|
||||
copy_to_device(cache.gyz, gyz, bytes) &&
|
||||
copy_to_device(cache.dzz, dzz, bytes) &&
|
||||
copy_to_device(cache.Axx, Axx, bytes) &&
|
||||
copy_to_device(cache.Axy, Axy, bytes) &&
|
||||
copy_to_device(cache.Axz, Axz, bytes) &&
|
||||
copy_to_device(cache.Ayy, Ayy, bytes) &&
|
||||
copy_to_device(cache.Ayz, Ayz, bytes) &&
|
||||
copy_to_device(cache.Azz, Azz, bytes);
|
||||
|
||||
if (ok)
|
||||
{
|
||||
double *d_dxx = cache.dxx.ptr, *d_gxy = cache.gxy.ptr, *d_gxz = cache.gxz.ptr;
|
||||
double *d_dyy = cache.dyy.ptr, *d_gyz = cache.gyz.ptr, *d_dzz = cache.dzz.ptr;
|
||||
double *d_Axx = cache.Axx.ptr, *d_Axy = cache.Axy.ptr, *d_Axz = cache.Axz.ptr;
|
||||
double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr;
|
||||
void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
|
||||
&d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
|
||||
ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args);
|
||||
@@ -408,27 +436,22 @@ int bssn_cuda_enforce_ga(int *ex,
|
||||
|
||||
if (ok)
|
||||
{
|
||||
cudaError_t err = cudaMemcpy(dxx, d_dxx, bytes, cudaMemcpyDeviceToHost);
|
||||
cudaError_t err = cudaMemcpy(dxx, cache.dxx.ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dxx", err);
|
||||
ok = err == cudaSuccess;
|
||||
if (ok) { err = cudaMemcpy(gxy, d_gxy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(gxz, d_gxz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(dyy, d_dyy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(gyz, d_gyz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(dzz, d_dzz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Axx, d_Axx, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Axy, d_Axy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Axz, d_Axz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Ayy, d_Ayy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Ayz, d_Ayz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Azz, d_Azz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(gxy, cache.gxy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(gxz, cache.gxz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(dyy, cache.dyy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(gyz, cache.gyz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(dzz, cache.dzz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Axx, cache.Axx.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Axy, cache.Axy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Axz, cache.Axz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Ayy, cache.Ayy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Ayz, cache.Ayz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; }
|
||||
if (ok) { err = cudaMemcpy(Azz, cache.Azz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; }
|
||||
}
|
||||
|
||||
free_device(d_dxx); free_device(d_gxy); free_device(d_gxz);
|
||||
free_device(d_dyy); free_device(d_gyz); free_device(d_dzz);
|
||||
free_device(d_Axx); free_device(d_Axy); free_device(d_Axz);
|
||||
free_device(d_Ayy); free_device(d_Ayz); free_device(d_Azz);
|
||||
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
@@ -446,6 +469,19 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
int lev,
|
||||
int rk_stage)
|
||||
{
|
||||
struct Rk4BoundaryCache
|
||||
{
|
||||
CachedBuffer X, Y, Z;
|
||||
CachedBuffer state0, boundary, stage, rhs;
|
||||
const double *host_X = nullptr;
|
||||
const double *host_Y = nullptr;
|
||||
const double *host_Z = nullptr;
|
||||
int nx = 0;
|
||||
int ny = 0;
|
||||
int nz = 0;
|
||||
};
|
||||
static thread_local Rk4BoundaryCache cache;
|
||||
|
||||
int nx = ex[0];
|
||||
int ny = ex[1];
|
||||
int nz = ex[2];
|
||||
@@ -457,23 +493,32 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
dim3 block(256);
|
||||
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
||||
|
||||
double *d_X = nullptr, *d_Y = nullptr, *d_Z = nullptr;
|
||||
double *d_state0 = nullptr, *d_boundary = nullptr, *d_stage = nullptr, *d_rhs = nullptr;
|
||||
bool ok = true;
|
||||
if (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
|
||||
cache.nx != nx || cache.ny != ny || cache.nz != nz)
|
||||
{
|
||||
ok = copy_to_device(cache.X, X, bytes_x) &&
|
||||
copy_to_device(cache.Y, Y, bytes_y) &&
|
||||
copy_to_device(cache.Z, Z, bytes_z);
|
||||
if (ok)
|
||||
{
|
||||
cache.host_X = X;
|
||||
cache.host_Y = Y;
|
||||
cache.host_Z = Z;
|
||||
cache.nx = nx;
|
||||
cache.ny = ny;
|
||||
cache.nz = nz;
|
||||
}
|
||||
}
|
||||
|
||||
bool ok = copy_to_device(d_X, X, bytes_x) &&
|
||||
copy_to_device(d_Y, Y, bytes_y) &&
|
||||
copy_to_device(d_Z, Z, bytes_z) &&
|
||||
copy_to_device(d_state0, state0, bytes) &&
|
||||
copy_to_device(d_boundary, boundary_src, bytes) &&
|
||||
copy_to_device(d_stage, stage_data, bytes) &&
|
||||
copy_to_device(d_rhs, rhs_accum, bytes);
|
||||
ok = ok &&
|
||||
copy_to_device(cache.state0, state0, bytes) &&
|
||||
copy_to_device(cache.boundary, boundary_src, bytes) &&
|
||||
copy_to_device(cache.stage, stage_data, bytes) &&
|
||||
copy_to_device(cache.rhs, rhs_accum, bytes);
|
||||
|
||||
if (!ok)
|
||||
{
|
||||
free_device(d_X); free_device(d_Y); free_device(d_Z);
|
||||
free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
|
||||
return 1;
|
||||
}
|
||||
|
||||
double dX = X[1] - X[0];
|
||||
double dY = Y[1] - Y[0];
|
||||
@@ -498,6 +543,9 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
|
||||
if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;
|
||||
|
||||
double *d_X = cache.X.ptr, *d_Y = cache.Y.ptr, *d_Z = cache.Z.ptr;
|
||||
double *d_state0 = cache.state0.ptr, *d_boundary = cache.boundary.ptr;
|
||||
double *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr;
|
||||
double *bam_target = (rk_stage == 0) ? d_rhs : d_stage;
|
||||
const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
|
||||
void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
|
||||
@@ -513,12 +561,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
|
||||
if (ok)
|
||||
{
|
||||
double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr;
|
||||
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
|
||||
ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args);
|
||||
}
|
||||
|
||||
if (ok && lev > 0)
|
||||
{
|
||||
double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr;
|
||||
void *args[] = {&nx, &ny, &nz,
|
||||
&has_xmin, &has_ymin, &has_zmin,
|
||||
&has_xmax, &has_ymax, &has_zmax,
|
||||
@@ -528,45 +578,43 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
|
||||
if (ok)
|
||||
{
|
||||
cudaError_t err = cudaMemcpy(stage_data, d_stage, bytes, cudaMemcpyDeviceToHost);
|
||||
cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
||||
ok = err == cudaSuccess;
|
||||
if (ok)
|
||||
{
|
||||
err = cudaMemcpy(rhs_accum, d_rhs, bytes, cudaMemcpyDeviceToHost);
|
||||
err = cudaMemcpy(rhs_accum, cache.rhs.ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err);
|
||||
ok = err == cudaSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
free_device(d_X); free_device(d_Y); free_device(d_Z);
|
||||
free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
|
||||
{
|
||||
static thread_local CachedBuffer d_chi;
|
||||
|
||||
int n = count_points(ex);
|
||||
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
||||
dim3 block(256);
|
||||
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
||||
|
||||
double *d_chi = nullptr;
|
||||
bool ok = copy_to_device(d_chi, chi, bytes);
|
||||
|
||||
if (ok)
|
||||
{
|
||||
void *args[] = {&n, &d_chi, &tinny};
|
||||
double *ptr = d_chi.ptr;
|
||||
void *args[] = {&n, &ptr, &tinny};
|
||||
ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args);
|
||||
}
|
||||
|
||||
if (ok)
|
||||
{
|
||||
cudaError_t err = cudaMemcpy(chi, d_chi, bytes, cudaMemcpyDeviceToHost);
|
||||
cudaError_t err = cudaMemcpy(chi, d_chi.ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
|
||||
ok = err == cudaSuccess;
|
||||
}
|
||||
|
||||
free_device(d_chi);
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user