Reduce GPU RK4 transfer overhead

This commit is contained in:
2026-04-09 12:11:40 +08:00
parent 49409645c0
commit e1e3b4a448
3 changed files with 69 additions and 155 deletions

View File

@@ -461,20 +461,21 @@ int bssn_cuda_enforce_ga(int *ex,
if (ok)
{
cudaError_t err = cudaMemcpy(dxx, cache.dxx.ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dxx", err);
ok = err == cudaSuccess;
if (ok) { err = cudaMemcpy(gxy, cache.gxy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; }
if (ok) { err = cudaMemcpy(gxz, cache.gxz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; }
if (ok) { err = cudaMemcpy(dyy, cache.dyy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; }
if (ok) { err = cudaMemcpy(gyz, cache.gyz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; }
if (ok) { err = cudaMemcpy(dzz, cache.dzz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; }
if (ok) { err = cudaMemcpy(Axx, cache.Axx.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; }
if (ok) { err = cudaMemcpy(Axy, cache.Axy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; }
if (ok) { err = cudaMemcpy(Axz, cache.Axz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; }
if (ok) { err = cudaMemcpy(Ayy, cache.Ayy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; }
if (ok) { err = cudaMemcpy(Ayz, cache.Ayz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; }
if (ok) { err = cudaMemcpy(Azz, cache.Azz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; }
// The next GPU RHS stage consumes these fields immediately.
// Keep them device-resident and expose the mapping so gpu_rhs() can
// reuse them via D2D copies instead of forcing an intermediate D2H round-trip.
bssn_gpu_register_device_buffer(dxx, cache.dxx.ptr);
bssn_gpu_register_device_buffer(gxy, cache.gxy.ptr);
bssn_gpu_register_device_buffer(gxz, cache.gxz.ptr);
bssn_gpu_register_device_buffer(dyy, cache.dyy.ptr);
bssn_gpu_register_device_buffer(gyz, cache.gyz.ptr);
bssn_gpu_register_device_buffer(dzz, cache.dzz.ptr);
bssn_gpu_register_device_buffer(Axx, cache.Axx.ptr);
bssn_gpu_register_device_buffer(Axy, cache.Axy.ptr);
bssn_gpu_register_device_buffer(Axz, cache.Axz.ptr);
bssn_gpu_register_device_buffer(Ayy, cache.Ayy.ptr);
bssn_gpu_register_device_buffer(Ayz, cache.Ayz.ptr);
bssn_gpu_register_device_buffer(Azz, cache.Azz.ptr);
}
return ok ? 0 : 1;
@@ -549,15 +550,31 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
(rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz;
const bool refresh_rhs =
(rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
double *stage_ptr = nullptr;
const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;
ok = ok &&
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
(!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
(!need_stage_input || copy_to_device_preferring_device(cache.stage, stage_data, bytes)) &&
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
if (ok && !need_stage_input)
if (ok && need_stage_input)
{
if (mapped_stage_ptr)
{
stage_ptr = const_cast<double *>(mapped_stage_ptr);
}
else
{
ok = copy_to_device_preferring_device(cache.stage, stage_data, bytes);
stage_ptr = cache.stage.ptr;
}
}
else if (ok)
{
ok = ensure_capacity(cache.stage, bytes);
stage_ptr = cache.stage.ptr;
}
if (!ok)
return 1;
@@ -599,8 +616,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
double *d_X = cache.X.ptr, *d_Y = cache.Y.ptr, *d_Z = cache.Z.ptr;
double *d_state0 = cache.state0.ptr, *d_boundary = cache.boundary.ptr;
double *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr;
double *bam_target = (rk_stage == 0) ? d_rhs : d_stage;
double *d_rhs = cache.rhs.ptr;
double *bam_target = (rk_stage == 0) ? d_rhs : stage_ptr;
const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
&xmin, &ymin, &zmin, &xmax, &ymax, &zmax,
@@ -615,14 +632,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
if (ok)
{
double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr;
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
ok = launch_kernel(grid, block, (const void *)rk4_kernel, args);
}
if (ok && lev > 0)
{
double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr;
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
void *args[] = {&nx, &ny, &nz,
&has_xmin, &has_ymin, &has_zmin,
&has_xmax, &has_ymax, &has_zmax,
@@ -632,9 +649,9 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
if (ok)
{
bssn_gpu_register_device_buffer(stage_data, cache.stage.ptr);
bssn_gpu_register_device_buffer(stage_data, stage_ptr);
cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
ok = err == cudaSuccess;
}
@@ -651,18 +668,30 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
dim3 block(256);
dim3 grid(div_up(n, static_cast<int>(block.x)));
bool ok = copy_to_device_preferring_device(d_chi, chi, bytes);
double *device_chi = nullptr;
const double *mapped = bssn_gpu_find_device_buffer(chi);
bool ok = true;
if (mapped)
{
device_chi = const_cast<double *>(mapped);
}
else
{
ok = copy_to_device_preferring_device(d_chi, chi, bytes);
device_chi = d_chi.ptr;
}
if (ok)
{
double *ptr = d_chi.ptr;
double *ptr = device_chi;
void *args[] = {&n, &ptr, &tinny};
ok = launch_kernel(grid, block, (const void *)lowerbound_kernel, args);
}
if (ok)
{
cudaError_t err = cudaMemcpy(chi, d_chi.ptr, bytes, cudaMemcpyDeviceToHost);
bssn_gpu_register_device_buffer(chi, device_chi);
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
ok = err == cudaSuccess;
}