Optimize GPU RK4 stage sync path
This commit is contained in:
@@ -50,6 +50,12 @@ struct CachedIntBuffer
|
||||
size_t capacity = 0;
|
||||
};
|
||||
|
||||
struct CachedPtrBuffer
|
||||
{
|
||||
void *ptr = nullptr;
|
||||
size_t capacity = 0;
|
||||
};
|
||||
|
||||
inline void release_buffer(CachedBuffer &buffer)
|
||||
{
|
||||
if (buffer.ptr)
|
||||
@@ -74,6 +80,18 @@ inline void release_buffer(CachedIntBuffer &buffer)
|
||||
buffer.capacity = 0;
|
||||
}
|
||||
|
||||
inline void release_buffer(CachedPtrBuffer &buffer)
|
||||
{
|
||||
if (buffer.ptr)
|
||||
{
|
||||
cudaError_t free_err = cudaFree(buffer.ptr);
|
||||
if (free_err != cudaSuccess)
|
||||
report_cuda_error("cudaFree", free_err);
|
||||
buffer.ptr = nullptr;
|
||||
}
|
||||
buffer.capacity = 0;
|
||||
}
|
||||
|
||||
inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
|
||||
{
|
||||
if (bytes <= buffer.capacity && buffer.ptr)
|
||||
@@ -124,6 +142,31 @@ inline bool ensure_capacity(CachedIntBuffer &buffer, size_t bytes)
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool ensure_capacity(CachedPtrBuffer &buffer, size_t bytes)
|
||||
{
|
||||
if (bytes <= buffer.capacity && buffer.ptr)
|
||||
return true;
|
||||
|
||||
if (buffer.ptr)
|
||||
{
|
||||
cudaError_t free_err = cudaFree(buffer.ptr);
|
||||
if (free_err != cudaSuccess)
|
||||
report_cuda_error("cudaFree", free_err);
|
||||
buffer.ptr = nullptr;
|
||||
buffer.capacity = 0;
|
||||
}
|
||||
|
||||
cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMalloc", err);
|
||||
return false;
|
||||
}
|
||||
|
||||
buffer.capacity = bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
struct Rk4VarCache
|
||||
{
|
||||
CachedBuffer X, Y, Z;
|
||||
@@ -169,6 +212,13 @@ struct InterpBatchCache
|
||||
InterpStencilCacheEntry stencil_entry;
|
||||
};
|
||||
|
||||
struct Rk4BatchCache
|
||||
{
|
||||
CachedPtrBuffer state0_ptrs;
|
||||
CachedPtrBuffer stage_ptrs;
|
||||
CachedPtrBuffer rhs_ptrs;
|
||||
};
|
||||
|
||||
std::unordered_map<const double *, Rk4VarCache> &rk4_var_cache_map()
|
||||
{
|
||||
static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map;
|
||||
@@ -181,6 +231,12 @@ InterpBatchCache &interp_batch_cache()
|
||||
return cache;
|
||||
}
|
||||
|
||||
Rk4BatchCache &rk4_batch_cache()
|
||||
{
|
||||
static thread_local Rk4BatchCache cache;
|
||||
return cache;
|
||||
}
|
||||
|
||||
inline void release_interp_stencil_cache(InterpStencilCacheEntry &entry)
|
||||
{
|
||||
release_buffer(entry.weights);
|
||||
@@ -791,6 +847,63 @@ __global__ void copy_physical_boundary_kernel(int nx, int ny, int nz,
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void rk4_boundary_batch_kernel(int n, int nx, int ny, int nz,
|
||||
int has_xmin, int has_ymin, int has_zmin,
|
||||
int has_xmax, int has_ymax, int has_zmax,
|
||||
int num_var, double dT,
|
||||
const double *const *state0_list,
|
||||
double *const *stage_list,
|
||||
double *const *rhs_list,
|
||||
int stage)
|
||||
{
|
||||
const double half = 0.5;
|
||||
const double one_sixth = 1.0 / 6.0;
|
||||
|
||||
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
||||
{
|
||||
const int plane = nx * ny;
|
||||
const int k = idx / plane;
|
||||
const int rem = idx - k * plane;
|
||||
const int j = rem / nx;
|
||||
const int i = rem - j * nx;
|
||||
const bool is_boundary =
|
||||
(has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
|
||||
(has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
|
||||
(has_zmin && k == 0) || (has_zmax && k == nz - 1);
|
||||
|
||||
for (int v = 0; v < num_var; ++v)
|
||||
{
|
||||
const double *f0 = state0_list[v];
|
||||
double *f1 = stage_list[v];
|
||||
double *rhs = rhs_list[v];
|
||||
|
||||
double out;
|
||||
if (stage == 0)
|
||||
{
|
||||
out = f0[idx] + half * dT * rhs[idx];
|
||||
}
|
||||
else if (stage == 1)
|
||||
{
|
||||
rhs[idx] += 2.0 * f1[idx];
|
||||
out = f0[idx] + half * dT * f1[idx];
|
||||
}
|
||||
else if (stage == 2)
|
||||
{
|
||||
rhs[idx] += 2.0 * f1[idx];
|
||||
out = f0[idx] + dT * f1[idx];
|
||||
}
|
||||
else
|
||||
{
|
||||
out = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
|
||||
}
|
||||
|
||||
if (is_boundary)
|
||||
out = f0[idx];
|
||||
f1[idx] = out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
|
||||
const double *X, const double *Y, const double *Z,
|
||||
double xmin, double ymin, double zmin,
|
||||
@@ -1032,6 +1145,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
int symmetry,
|
||||
int lev,
|
||||
int rk_stage,
|
||||
bool force_host_boundary_fix,
|
||||
bool download_to_host)
|
||||
{
|
||||
Rk4VarCache &cache = rk4_var_cache_map()[state0];
|
||||
@@ -1166,24 +1280,34 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
|
||||
}
|
||||
|
||||
if (ok && lev == 0)
|
||||
if (ok && (lev == 0 || !force_host_boundary_fix))
|
||||
{
|
||||
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
|
||||
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
|
||||
ok = launch_kernel(grid, block, (const void *)rk4_kernel, args);
|
||||
}
|
||||
|
||||
if (ok && lev > 0)
|
||||
if (ok && lev > 0 && !force_host_boundary_fix)
|
||||
{
|
||||
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
|
||||
void *args[] = {&nx, &ny, &nz,
|
||||
&has_xmin, &has_ymin, &has_zmin,
|
||||
&has_xmax, &has_ymax, &has_zmax,
|
||||
&d_state0, &d_stage};
|
||||
ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
|
||||
}
|
||||
|
||||
if (ok && lev > 0 && force_host_boundary_fix)
|
||||
{
|
||||
double *host_state0 = const_cast<double *>(state0);
|
||||
double *host_phi = const_cast<double *>(phi_field);
|
||||
double *host_lap = const_cast<double *>(lap_field);
|
||||
double *host_rhs = rhs_accum;
|
||||
|
||||
ok = sync_host_from_mapped_device(host_state0, n, "cudaMemcpy(D2H) state0") &&
|
||||
sync_host_from_mapped_device(host_phi, n, "cudaMemcpy(D2H) phi_field") &&
|
||||
sync_host_from_mapped_device(host_lap, n, "cudaMemcpy(D2H) lap_field") &&
|
||||
sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum");
|
||||
// state0/phi/lap are read-only during the current RK step, so the host copies
|
||||
// remain valid even if cached device mirrors exist. Only the RHS accumulator
|
||||
// is updated on device and must be synchronized back for the CPU fallback.
|
||||
ok = sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum");
|
||||
if (ok)
|
||||
{
|
||||
bssn_gpu_prepare_host_buffer(stage_data, n);
|
||||
@@ -1232,6 +1356,176 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
int bssn_cuda_rk4_boundary_batch(int *ex, double dT,
|
||||
const double *X, const double *Y, const double *Z,
|
||||
double xmin, double ymin, double zmin,
|
||||
double xmax, double ymax, double zmax,
|
||||
int symmetry,
|
||||
const double *const *state0_list,
|
||||
double *const *stage_data_list,
|
||||
double *const *rhs_accum_list,
|
||||
int num_var,
|
||||
int rk_stage,
|
||||
bool download_to_host)
|
||||
{
|
||||
if (!state0_list || !stage_data_list || !rhs_accum_list || num_var <= 0)
|
||||
return 1;
|
||||
|
||||
const int nx = ex[0];
|
||||
const int ny = ex[1];
|
||||
const int nz = ex[2];
|
||||
const int n = count_points(ex);
|
||||
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
||||
const size_t ptr_bytes = static_cast<size_t>(num_var) * sizeof(double *);
|
||||
dim3 block(256);
|
||||
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
||||
|
||||
std::vector<const double *> host_state0_ptrs(num_var);
|
||||
std::vector<double *> host_stage_ptrs(num_var);
|
||||
std::vector<double *> host_rhs_ptrs(num_var);
|
||||
|
||||
bool ok = true;
|
||||
for (int v = 0; v < num_var && ok; ++v)
|
||||
{
|
||||
const double *state0 = state0_list[v];
|
||||
double *stage_data = stage_data_list[v];
|
||||
double *rhs_accum = rhs_accum_list[v];
|
||||
Rk4VarCache &cache = rk4_var_cache_map()[state0];
|
||||
|
||||
const bool refresh_state0 =
|
||||
(rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz;
|
||||
const bool refresh_rhs =
|
||||
(rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
|
||||
const bool need_stage_input = (rk_stage != 0);
|
||||
double *stage_ptr = nullptr;
|
||||
const double *mapped_state0_ptr = refresh_state0 ? bssn_gpu_find_device_buffer(state0) : cache.state0.ptr;
|
||||
const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;
|
||||
const double *mapped_rhs_ptr = refresh_rhs ? bssn_gpu_find_device_buffer(rhs_accum) : cache.rhs.ptr;
|
||||
|
||||
if (refresh_state0 && !mapped_state0_ptr)
|
||||
bssn_gpu_prepare_host_buffer(state0, n);
|
||||
if (need_stage_input && !mapped_stage_ptr)
|
||||
bssn_gpu_prepare_host_buffer(stage_data, n);
|
||||
if (refresh_rhs && !mapped_rhs_ptr)
|
||||
bssn_gpu_prepare_host_buffer(rhs_accum, n);
|
||||
|
||||
ok = (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
|
||||
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
|
||||
if (!ok)
|
||||
break;
|
||||
|
||||
if (need_stage_input)
|
||||
{
|
||||
if (mapped_stage_ptr)
|
||||
{
|
||||
stage_ptr = const_cast<double *>(mapped_stage_ptr);
|
||||
}
|
||||
else
|
||||
{
|
||||
ok = copy_to_device_preferring_device(cache.stage, stage_data, bytes);
|
||||
stage_ptr = cache.stage.ptr;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ok = ensure_capacity(cache.stage, bytes);
|
||||
stage_ptr = cache.stage.ptr;
|
||||
}
|
||||
if (!ok)
|
||||
break;
|
||||
|
||||
if (refresh_state0)
|
||||
{
|
||||
cache.host_state0 = state0;
|
||||
cache.nx = nx;
|
||||
cache.ny = ny;
|
||||
cache.nz = nz;
|
||||
bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
|
||||
}
|
||||
if (refresh_rhs)
|
||||
{
|
||||
cache.host_rhs = rhs_accum;
|
||||
cache.rhs_resident = true;
|
||||
bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
|
||||
}
|
||||
|
||||
host_state0_ptrs[v] = cache.state0.ptr;
|
||||
host_stage_ptrs[v] = stage_ptr;
|
||||
host_rhs_ptrs[v] = cache.rhs.ptr;
|
||||
}
|
||||
|
||||
if (!ok)
|
||||
return 1;
|
||||
|
||||
Rk4BatchCache &batch_cache = rk4_batch_cache();
|
||||
ok = ensure_capacity(batch_cache.state0_ptrs, ptr_bytes) &&
|
||||
ensure_capacity(batch_cache.stage_ptrs, ptr_bytes) &&
|
||||
ensure_capacity(batch_cache.rhs_ptrs, ptr_bytes);
|
||||
if (!ok)
|
||||
return 1;
|
||||
|
||||
cudaError_t err = cudaMemcpy(batch_cache.state0_ptrs.ptr, &host_state0_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(H2D) batch state0 ptrs", err);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMemcpy(batch_cache.stage_ptrs.ptr, &host_stage_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(H2D) batch stage ptrs", err);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMemcpy(batch_cache.rhs_ptrs.ptr, &host_rhs_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(H2D) batch rhs ptrs", err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
double dX = X[1] - X[0];
|
||||
double dY = Y[1] - Y[0];
|
||||
double dZ = Z[1] - Z[0];
|
||||
const int no_symm = 0, octant = 2;
|
||||
int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
|
||||
int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
|
||||
int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
|
||||
int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
|
||||
int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
|
||||
int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);
|
||||
|
||||
int n_arg = n, nx_arg = nx, ny_arg = ny, nz_arg = nz;
|
||||
int num_var_arg = num_var, rk_stage_arg = rk_stage;
|
||||
void *args[] = {&n_arg, &nx_arg, &ny_arg, &nz_arg,
|
||||
&has_xmin, &has_ymin, &has_zmin,
|
||||
&has_xmax, &has_ymax, &has_zmax,
|
||||
&num_var_arg, &dT,
|
||||
&batch_cache.state0_ptrs.ptr,
|
||||
&batch_cache.stage_ptrs.ptr,
|
||||
&batch_cache.rhs_ptrs.ptr,
|
||||
&rk_stage_arg};
|
||||
ok = launch_kernel(grid, block, (const void *)rk4_boundary_batch_kernel, args);
|
||||
|
||||
if (!ok)
|
||||
return 1;
|
||||
|
||||
for (int v = 0; v < num_var; ++v)
|
||||
{
|
||||
bssn_gpu_register_device_buffer(stage_data_list[v], host_stage_ptrs[v]);
|
||||
if (download_to_host)
|
||||
{
|
||||
err = cudaMemcpy(stage_data_list[v], host_stage_ptrs[v], bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(D2H) batch stage_data", err);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bssn_cuda_release_rk4_caches()
|
||||
{
|
||||
std::unordered_map<const double *, Rk4VarCache> &cache_map = rk4_var_cache_map();
|
||||
@@ -1248,6 +1542,9 @@ void bssn_cuda_release_rk4_caches()
|
||||
release_buffer(cache.rhs);
|
||||
}
|
||||
cache_map.clear();
|
||||
release_buffer(rk4_batch_cache().state0_ptrs);
|
||||
release_buffer(rk4_batch_cache().stage_ptrs);
|
||||
release_buffer(rk4_batch_cache().rhs_ptrs);
|
||||
}
|
||||
|
||||
void bssn_cuda_release_interp_caches()
|
||||
|
||||
Reference in New Issue
Block a user