diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 7592cea..ecbac7a 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -95,7 +95,7 @@ void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int while (BP) { Block *cg = BP->data; - if (myrank == cg->rank) + if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) { double *state_out[BSSN_CUDA_STATE_COUNT]; if (!fill_bssn_cuda_views(cg, vars, state_out)) @@ -3301,18 +3301,21 @@ void bssn_class::Step(int lev, int YN) cg->fgfs[varlrhs->data->sgfn], iter_count); #endif + if (!used_gpu_substep) + { #ifndef WithShell - if (lev > 0) // fix BD point + if (lev > 0) // fix BD point #endif - f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], - Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], - Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], - dT_lev, - cg->fgfs[phi0->sgfn], - cg->fgfs[Lap0->sgfn], - cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], - varl0->data->SoA, - Symmetry, cor); + f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], + Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], + Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], + dT_lev, + cg->fgfs[phi0->sgfn], + cg->fgfs[Lap0->sgfn], + cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], + varl0->data->SoA, + Symmetry, cor); + } #if (SommerType == 1) #warning "shell part still bam type" @@ -3333,7 +3336,7 @@ void bssn_class::Step(int lev, int YN) varlrhs = varlrhs->next; } } - if (!used_gpu_resident_state) + if (!used_gpu_substep) f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny); } if (BP == Pp->data->ble) @@ -3722,18 +3725,21 @@ void bssn_class::Step(int lev, int YN) iter_count); #endif + if (!used_gpu_substep) + { #ifndef WithShell - if (lev > 0) // fix BD point + if (lev > 0) // fix BD point #endif - f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], - Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], - Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], - dT_lev, - cg->fgfs[phi0->sgfn], - cg->fgfs[Lap0->sgfn], - cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn], - varl0->data->SoA, - Symmetry, cor); + f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], + Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], + Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], + dT_lev, + cg->fgfs[phi0->sgfn], + cg->fgfs[Lap0->sgfn], + cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn], + varl0->data->SoA, + Symmetry, cor); + } #if (SommerType == 1) if (lev == 1) // shibata type sommerfeld @@ -3754,7 +3760,7 @@ void bssn_class::Step(int lev, int YN) varlrhs = varlrhs->next; } } - if (!used_gpu_resident_state) + if (!used_gpu_substep) f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny); } if (BP == Pp->data->ble) @@ -8636,4 +8642,3 @@ bool bssn_class::check_Stdin_Abort() } //================================================================================================ - diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index f709e48..abcad51 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -1216,6 +1216,11 @@ struct Rk4FinalizeTables { double *accum_fields[BSSN_STATE_COUNT]; }; +struct PatchBoundaryTables { + const double *src_fields[BSSN_STATE_COUNT]; + double *dst_fields[BSSN_STATE_COUNT]; +}; + static const int BLK = 128; static inline int grid(size_t n) { if (n == 0) return 1; @@ -1591,6 +1596,56 @@ static void gpu_rk4_finalize_batch(const StepContext &ctx, kern_rk4_finalize_batched<<>>(tables, dT, rk4_stage, chitiny); } +__global__ __launch_bounds__(128, 4) +void kern_copy_patch_boundary_batched(PatchBoundaryTables tables, + int touch_xmin, int touch_xmax, + int touch_ymin, int touch_ymax, + int touch_zmin, int touch_zmax) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int nx = d_gp.ex[0]; + const int ny = d_gp.ex[1]; + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + + const bool on_boundary = + (touch_xmin && i0 == 0) || + (touch_xmax && i0 == nx - 1) || + (touch_ymin && j0 == 0) || + (touch_ymax && j0 == ny - 1) || + (touch_zmin && k0 == 0) || + (touch_zmax && k0 == d_gp.ex[2] - 1); + if (!on_boundary) return; + + const int field = blockIdx.y; + tables.dst_fields[field][tid] = tables.src_fields[field][tid]; +} + +static void gpu_copy_patch_boundary_batch(int all, + int touch_xmin, int touch_xmax, + int touch_ymin, int touch_ymax, + int touch_zmin, int touch_zmax) +{ + if (!(touch_xmin || touch_xmax || touch_ymin || touch_ymax || touch_zmin || touch_zmax)) + return; + + PatchBoundaryTables tables = {}; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + tables.src_fields[i] = g_buf.slot[k_state_input_slots[i]]; + tables.dst_fields[i] = g_buf.slot[k_state_rhs_slots[i]]; + } + + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)BSSN_STATE_COUNT); + kern_copy_patch_boundary_batched<<>>( + tables, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); +} + __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx, double * __restrict__ gxy, double * __restrict__ gxz, @@ -3008,6 +3063,32 @@ static void setup_grid_params(int *ex, upload_grid_params_if_needed(gp); } +static void compute_patch_boundary_flags(int *ex, + double *X, double *Y, double *Z, + const double *bbox, + int Symmetry, + int &touch_xmin, int &touch_xmax, + int &touch_ymin, int &touch_ymax, + int &touch_zmin, int &touch_zmax) +{ + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + const int NO_SYMM = 0; + const int OCTANT = 2; + + touch_xmax = (std::fabs(X[ex[0] - 1] - bbox[3]) < dX) ? 1 : 0; + touch_ymax = (std::fabs(Y[ex[1] - 1] - bbox[4]) < dY) ? 1 : 0; + touch_zmax = (std::fabs(Z[ex[2] - 1] - bbox[5]) < dZ) ? 1 : 0; + + touch_xmin = (std::fabs(X[0] - bbox[0]) < dX && + !(Symmetry == OCTANT && std::fabs(bbox[0]) < dX / 2.0)) ? 1 : 0; + touch_ymin = (std::fabs(Y[0] - bbox[1]) < dY && + !(Symmetry == OCTANT && std::fabs(bbox[1]) < dY / 2.0)) ? 1 : 0; + touch_zmin = (std::fabs(Z[0] - bbox[2]) < dZ && + !(Symmetry > NO_SYMM && std::fabs(bbox[2]) < dZ / 2.0)) ? 1 : 0; +} + static void upload_state_inputs(double **state_host, size_t all) { const size_t bytes = all * sizeof(double); @@ -3922,8 +4003,17 @@ int bssn_cuda_rk4_substep(void *block_tag, const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t bytes = all * sizeof(double); + int touch_xmin = 0, touch_xmax = 0; + int touch_ymin = 0, touch_ymax = 0; + int touch_zmin = 0, touch_zmax = 0; setup_grid_params(ex, X, Y, Z, Symmetry, eps, co); + if (Lev > 0) { + compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); + } StepContext &ctx = ensure_step_ctx(block_tag, all); const bool use_resident_state = (keep_resident_state != 0); if (use_resident_state) { @@ -3986,6 +4076,12 @@ int bssn_cuda_rk4_substep(void *block_tag, t0 = profile ? cuda_profile_now_ms() : 0.0; gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny); + if (Lev > 0) { + gpu_copy_patch_boundary_batch((int)all, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); + } if (profile) { cuda_profile_sync(); finalize_ms += cuda_profile_now_ms() - t0;