Trim GPU main-path transfer overhead

This commit is contained in:
2026-04-08 20:16:25 +08:00
parent 01ac1f9250
commit a0af9b8804
2 changed files with 74 additions and 48 deletions

View File

@@ -493,9 +493,16 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
dim3 block(256); dim3 block(256);
dim3 grid(div_up(n, static_cast<int>(block.x))); dim3 grid(div_up(n, static_cast<int>(block.x)));
const bool need_bam_boundary = (lev == 0);
const bool need_coord_copy = need_bam_boundary;
const bool need_boundary_input = need_bam_boundary && (rk_stage != 0);
const bool need_stage_input = (rk_stage != 0);
const bool need_rhs_output = (rk_stage != 3);
bool ok = true; bool ok = true;
if (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z || if (need_coord_copy &&
cache.nx != nx || cache.ny != ny || cache.nz != nz) (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
cache.nx != nx || cache.ny != ny || cache.nz != nz))
{ {
ok = copy_to_device(cache.X, X, bytes_x) && ok = copy_to_device(cache.X, X, bytes_x) &&
copy_to_device(cache.Y, Y, bytes_y) && copy_to_device(cache.Y, Y, bytes_y) &&
@@ -512,10 +519,13 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
} }
ok = ok && ok = ok &&
copy_to_device(cache.state0, state0, bytes) && copy_to_device(cache.state0, state0, bytes) &&
copy_to_device(cache.boundary, boundary_src, bytes) && (!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
copy_to_device(cache.stage, stage_data, bytes) && (!need_stage_input || copy_to_device(cache.stage, stage_data, bytes)) &&
copy_to_device(cache.rhs, rhs_accum, bytes); copy_to_device(cache.rhs, rhs_accum, bytes);
if (ok && !need_stage_input)
ok = ensure_capacity(cache.stage, bytes);
if (!ok) if (!ok)
return 1; return 1;
@@ -534,7 +544,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
double soa1 = SoA[1]; double soa1 = SoA[1];
double soa2 = SoA[2]; double soa2 = SoA[2];
if (lev == 0) if (need_bam_boundary)
{ {
int imin = 1; int imin = 1;
int jmin = 1; int jmin = 1;
@@ -581,7 +591,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost); cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
ok = err == cudaSuccess; ok = err == cudaSuccess;
if (ok) if (ok && need_rhs_output)
{ {
err = cudaMemcpy(rhs_accum, cache.rhs.ptr, bytes, cudaMemcpyDeviceToHost); err = cudaMemcpy(rhs_accum, cache.rhs.ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err);

View File

@@ -134,6 +134,7 @@ struct GpuRhsCache
const double *last_x = nullptr; const double *last_x = nullptr;
const double *last_y = nullptr; const double *last_y = nullptr;
const double *last_z = nullptr; const double *last_z = nullptr;
bool meta_uploaded = false;
}; };
GpuRhsCache &gpu_rhs_cache() GpuRhsCache &gpu_rhs_cache()
@@ -231,9 +232,31 @@ bool register_gpu_rhs_cleanup()
return true; return true;
} }
void ensure_gpu_rhs_invariant_symbols()
{
static bool initialized = false;
if (initialized)
return;
double F1o3h = 1.0 / 3.0;
double F2o3h = 2.0 / 3.0;
double F1o6h = 1.0 / 6.0;
double PIh = M_PI;
int step = GRID_DIM * BLOCK_DIM;
cudaMemcpyToSymbol(F1o3, &F1o3h, sizeof(double));
cudaMemcpyToSymbol(F2o3, &F2o3h, sizeof(double));
cudaMemcpyToSymbol(F1o6, &F1o6h, sizeof(double));
cudaMemcpyToSymbol(PI, &PIh, sizeof(double));
cudaMemcpyToSymbol(STEP_SIZE, &step, sizeof(int));
initialized = true;
}
bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex) bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
{ {
register_gpu_rhs_cleanup(); register_gpu_rhs_cleanup();
ensure_gpu_rhs_invariant_symbols();
const bool shape_changed = const bool shape_changed =
!cache.allocated || !cache.allocated ||
@@ -261,6 +284,7 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
cache.last_x = nullptr; cache.last_x = nullptr;
cache.last_y = nullptr; cache.last_y = nullptr;
cache.last_z = nullptr; cache.last_z = nullptr;
cache.meta_uploaded = false;
Meta *meta = &cache.meta; Meta *meta = &cache.meta;
const int matrix_size = cache.matrix_size; const int matrix_size = cache.matrix_size;
@@ -446,7 +470,24 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
return false; return false;
} }
cudaMemcpyToSymbol(metac, meta, sizeof(Meta));
int _1d_size[4];
int _2d_size[4];
int _3d_size[4];
for (int i = 0; i < 4; ++i)
{
_1d_size[i] = ex[0] + i;
_2d_size[i] = _1d_size[i] * (ex[1] + i);
_3d_size[i] = _2d_size[i] * (ex[2] + i);
}
cudaMemcpyToSymbol(ex_c, ex, 3 * sizeof(int));
cudaMemcpyToSymbol(_1D_SIZE, _1d_size, 4 * sizeof(int));
cudaMemcpyToSymbol(_2D_SIZE, _2d_size, 4 * sizeof(int));
cudaMemcpyToSymbol(_3D_SIZE, _3d_size, 4 * sizeof(int));
cache.allocated = true; cache.allocated = true;
cache.meta_uploaded = true;
return true; return true;
} }
@@ -2989,46 +3030,21 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
#endif//if (GAUGE == 6 || GAUGE == 7) #endif//if (GAUGE == 6 || GAUGE == 7)
//3.1-----for compute_rhs_bssn--------- //3.1-----for compute_rhs_bssn---------
//cout<<"Size of Meta:"<<sizeof(Meta)<<endl; //cout<<"Size of Meta:"<<sizeof(Meta)<<endl;
cudaMemcpyToSymbol(metac,meta, sizeof(Meta)); cudaMemcpyToSymbol(T_c,&T, sizeof(double));
cudaMemcpyToSymbol(ex_c,ex, 3*sizeof(int)); cudaMemcpyToSymbol(Symmetry_c,&Symmetry, sizeof(int));
cudaMemcpyToSymbol(T_c,&T, sizeof(double)); cudaMemcpyToSymbol(Lev_c,&Lev, sizeof(int));
cudaMemcpyToSymbol(Symmetry_c,&Symmetry, sizeof(int)); cudaMemcpyToSymbol(co_c,&co, sizeof(int));
cudaMemcpyToSymbol(Lev_c,&Lev, sizeof(int)); cudaMemcpyToSymbol(eps_c,&eps, sizeof(double));
cudaMemcpyToSymbol(co_c,&co, sizeof(int));
cudaMemcpyToSymbol(eps_c,&eps, sizeof(double)); double dXh = X[1] - X[0];
double dYh = Y[1] - Y[0];
double F1o3h = 1.0; F1o3h /= 3.0; double dZh = Z[1] - Z[0];
double F2o3h = 2.0; F2o3h /= 3.0;
double F1o6h = 1.0; F1o6h /= 6.0; cudaMemcpyToSymbol(dX,&dXh, sizeof(double));
double PIh = M_PI; cudaMemcpyToSymbol(dY,&dYh, sizeof(double));
int step = GRID_DIM * BLOCK_DIM; cudaMemcpyToSymbol(dZ,&dZh, sizeof(double));
double dXh = X[1] - X[0];
double dYh = Y[1] - Y[0];
double dZh = Z[1] - Z[0];
cudaMemcpyToSymbol(F1o3,&F1o3h, sizeof(double));
cudaMemcpyToSymbol(F2o3,&F2o3h, sizeof(double));
cudaMemcpyToSymbol(F1o6,&F1o6h, sizeof(double));
cudaMemcpyToSymbol(PI,&PIh, sizeof(double));
cudaMemcpyToSymbol(STEP_SIZE,&step, sizeof(int));
cudaMemcpyToSymbol(dX,&dXh, sizeof(double));
cudaMemcpyToSymbol(dY,&dYh, sizeof(double));
cudaMemcpyToSymbol(dZ,&dZh, sizeof(double));
int _1d_size[4];
int _2d_size[4];
int _3d_size[4];
for(int i = 0;i<4;++i){
_1d_size[i] = ex[0] + i;
_2d_size[i] = _1d_size[i] * (ex[1]+i);
_3d_size[i] = _2d_size[i] * (ex[2]+i);
//cout<<_1d_size[i]<<' '<<_2d_size[i]<<' '<<_3d_size[i]<<endl;
}
cudaMemcpyToSymbol(_1D_SIZE,_1d_size, 4*sizeof(int));
cudaMemcpyToSymbol(_2D_SIZE,_2d_size, 4*sizeof(int));
cudaMemcpyToSymbol(_3D_SIZE,_3d_size, 4*sizeof(int));
//3.2--------for fderivs------------ //3.2--------for fderivs------------