Trim GPU main-path transfer overhead
This commit is contained in:
@@ -493,9 +493,16 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
dim3 block(256);
|
||||
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
||||
|
||||
+ const bool need_bam_boundary = (lev == 0);
+ const bool need_coord_copy = need_bam_boundary;
+ const bool need_boundary_input = need_bam_boundary && (rk_stage != 0);
+ const bool need_stage_input = (rk_stage != 0);
+ const bool need_rhs_output = (rk_stage != 3);
|
||||
|
||||
bool ok = true;
|
||||
- if (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
- cache.nx != nx || cache.ny != ny || cache.nz != nz)
+ if (need_coord_copy &&
+ (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
+ cache.nx != nx || cache.ny != ny || cache.nz != nz))
|
||||
{
|
||||
ok = copy_to_device(cache.X, X, bytes_x) &&
|
||||
copy_to_device(cache.Y, Y, bytes_y) &&
|
||||
@@ -512,10 +519,13 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
}
|
||||
|
||||
ok = ok &&
|
||||
- copy_to_device(cache.state0, state0, bytes) &&
- copy_to_device(cache.boundary, boundary_src, bytes) &&
- copy_to_device(cache.stage, stage_data, bytes) &&
- copy_to_device(cache.rhs, rhs_accum, bytes);
+ copy_to_device(cache.state0, state0, bytes) &&
+ (!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
+ (!need_stage_input || copy_to_device(cache.stage, stage_data, bytes)) &&
+ copy_to_device(cache.rhs, rhs_accum, bytes);
+
+ if (ok && !need_stage_input)
+ ok = ensure_capacity(cache.stage, bytes);
|
||||
|
||||
if (!ok)
|
||||
return 1;
|
||||
@@ -534,7 +544,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
double soa1 = SoA[1];
|
||||
double soa2 = SoA[2];
|
||||
|
||||
- if (lev == 0)
+ if (need_bam_boundary)
|
||||
{
|
||||
int imin = 1;
|
||||
int jmin = 1;
|
||||
@@ -581,7 +591,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
||||
ok = err == cudaSuccess;
|
||||
- if (ok)
+ if (ok && need_rhs_output)
|
||||
{
|
||||
err = cudaMemcpy(rhs_accum, cache.rhs.ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err);
|
||||
|
||||
Reference in New Issue
Block a user