Reduce staged GPU host-device copies

This commit is contained in:
2026-04-09 16:44:08 +08:00
parent 42e851d19a
commit 5b00d49070
5 changed files with 349 additions and 49 deletions

View File

@@ -728,7 +728,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
const double SoA[3],
int symmetry,
int lev,
int rk_stage)
int rk_stage,
bool download_to_host)
{
struct Rk4VarCache
{
@@ -790,7 +791,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
ok = ok &&
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
(!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
(!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) &&
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
if (ok && need_stage_input)
@@ -885,16 +886,18 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
if (ok)
{
bssn_gpu_register_device_buffer(stage_data, stage_ptr);
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
ok = err == cudaSuccess;
if (download_to_host)
{
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
ok = err == cudaSuccess;
}
}
return ok ? 0 : 1;
}
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host)
{
static thread_local CachedBuffer d_chi;
@@ -926,13 +929,32 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
if (ok)
{
bssn_gpu_register_device_buffer(chi, device_chi);
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
ok = err == cudaSuccess;
if (download_to_host)
{
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
ok = err == cudaSuccess;
}
}
return ok ? 0 : 1;
}
// Copy the device-side buffer associated with `host_ptr` back into host memory.
//
// The device pointer is obtained from bssn_gpu_find_device_buffer(host_ptr);
// when that lookup yields a null pointer nothing is copied.
//
// Parameters:
//   ex        passed to count_points() to obtain the number of doubles to copy
//             (presumably the grid extents — confirm against count_points).
//   host_ptr  destination host array; also the key used for the device lookup.
//
// Returns 0 when the copy succeeds, and 1 when no device buffer is found for
// `host_ptr` or when cudaMemcpy fails (the failure is passed to
// report_cuda_error before returning).
int bssn_cuda_download_buffer(int *ex, double *host_ptr)
{
    const double *mirror = bssn_gpu_find_device_buffer(host_ptr);
    if (mirror == nullptr)
    {
        // No device buffer is known for this host pointer.
        return 1;
    }

    const size_t n_bytes = sizeof(double) * static_cast<size_t>(count_points(ex));

    // Blocking device-to-host copy; on return the host array holds the data.
    cudaError_t status = cudaMemcpy(host_ptr, mirror, n_bytes, cudaMemcpyDeviceToHost);
    if (status == cudaSuccess)
    {
        return 0;
    }

    report_cuda_error("cudaMemcpy(D2H) buffered download", status);
    return 1;
}
int bssn_cuda_interp_points_batch(const int *ex,
const double *X, const double *Y, const double *Z,
const double *const *fields,