diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 9cabec9..740640d 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -1324,6 +1324,12 @@ int cuda_data_packer_device_batched(double *data, while (src && dst) { + if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg) + { + src = src->next; + dst = dst->next; + continue; + } const bool active = (dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank); @@ -1433,12 +1439,16 @@ bool cuda_segments_device_eligible(MyList *src, bool has_work = false; while (src && dst) { + if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg) + { + src = src->next; + dst = dst->next; + continue; + } if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank)) { has_work = true; - if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg) - return false; int type; if (src->data->Bg->lev == dst->data->Bg->lev) type = 1; diff --git a/AMSS_NCKU_source/bssnEScalar_class.C b/AMSS_NCKU_source/bssnEScalar_class.C index cc29f44..e1cf9d0 100644 --- a/AMSS_NCKU_source/bssnEScalar_class.C +++ b/AMSS_NCKU_source/bssnEScalar_class.C @@ -129,6 +129,17 @@ MyList *clone_var_list_prefix(MyList *src, int count) return dst; } +bool escalar_gpu_rk_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_ESCALAR_GPU_RK"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + void clear_var_list(MyList *&list) { if (list) @@ -175,6 +186,7 @@ int run_bssn_escalar_cuda_substep(Block *cg, int &co, double &chitiny, var *Sphi_in, var *Spi_in, + var *Sphi_out, var *Spi_out, var *Sphi_rhs, var *Spi_rhs, var *rho, var *Sx, var *Sy, var *Sz, var *Sxx, var *Sxy, var *Sxz, @@ -220,6 +232,26 @@ int run_bssn_escalar_cuda_substep(Block *cg, apply_bam_bc = (lev == 0) ? 1 : 0; #endif #endif + if (escalar_gpu_rk_enabled()) + { + double scalar_propspeed[2] = { + Sphi_in->propspeed, Spi_in->propspeed + }; + double scalar_soa[6] = { + Sphi_in->SoA[0], Sphi_in->SoA[1], Sphi_in->SoA[2], + Spi_in->SoA[0], Spi_in->SoA[1], Spi_in->SoA[2] + }; + if (bssn_cuda_escalar_finalize_scalar_fields(cg, + cg->shape, cg->X[0], cg->X[1], cg->X[2], + cg->fgfs[Sphi_out->sgfn], + cg->fgfs[Spi_out->sgfn], + scalar_propspeed, + scalar_soa, + patch->bbox, + dT_lev, iter_count, apply_bam_bc, + Symmetry, lev, ndeps, co)) + return 1; + } int use_zero_matter = 0; int keep_resident_state = 1; double **matter_precomputed = nullptr; @@ -1003,7 +1035,7 @@ void bssnEScalar_class::Step(int lev, int YN) (run_bssn_escalar_cuda_substep(cg, StateList, SynchList_pre, Pp->data, dT_lev, TRK4, iter_count, Symmetry, lev, ndeps, pre, chitiny, - Sphi0, Spi0, Sphi_rhs, Spi_rhs, + Sphi0, Spi0, Sphi, Spi, Sphi_rhs, Spi_rhs, rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz) == 0)) ? 0 : 1) || @@ -1058,12 +1090,37 @@ void bssnEScalar_class::Step(int lev, int YN) #if USE_CUDA_BSSN if (used_gpu_substep) skip_bssn_cuda_prefix(varl0, varl, varlrhs); +#endif + const bool scalar_gpu_rk_done = +#if USE_CUDA_BSSN + used_gpu_substep && escalar_gpu_rk_enabled(); +#else + false; #endif while (varl0) - { -#ifndef WithShell - if (lev == 0) // sommerfeld indeed - f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2], + { + if (scalar_gpu_rk_done) + { +#ifndef WithShell + if (lev > 0) // fix BD point +#endif + f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], + Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], + Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], + dT_lev, cg->fgfs[phi0->sgfn], + cg->fgfs[Lap0->sgfn], + cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], + varl0->data->SoA, + Symmetry, cor); + + varl0 = varl0->next; + varl = varl->next; + varlrhs = varlrhs->next; + continue; + } +#ifndef WithShell + if (lev == 0) // sommerfeld indeed + f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2], Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], cg->fgfs[varlrhs->data->sgfn], @@ -1369,7 +1426,7 @@ void bssnEScalar_class::Step(int lev, int YN) (run_bssn_escalar_cuda_substep(cg, SynchList_pre, SynchList_cor, Pp->data, dT_lev, TRK4, iter_count, Symmetry, lev, ndeps, cor, chitiny, - Sphi, Spi, Sphi_rhs, Spi_rhs, + Sphi, Spi, Sphi1, Spi1, Sphi_rhs, Spi_rhs, rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz) == 0)) ? 0 : 1) || @@ -1426,12 +1483,38 @@ void bssnEScalar_class::Step(int lev, int YN) if (used_gpu_substep) skip_bssn_cuda_prefix(varl0, varl, varl1, varlrhs); #endif + const bool scalar_gpu_rk_done = +#if USE_CUDA_BSSN + used_gpu_substep && escalar_gpu_rk_enabled(); +#else + false; +#endif while (varl0) - { -#ifndef WithShell - if (lev == 0) // sommerfeld indeed - f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2], + { + if (scalar_gpu_rk_done) + { +#ifndef WithShell + if (lev > 0) // fix BD point +#endif + f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], + Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], + Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], + dT_lev, cg->fgfs[phi0->sgfn], + cg->fgfs[Lap0->sgfn], + cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn], + varl0->data->SoA, + Symmetry, cor); + + varl0 = varl0->next; + varl = varl->next; + varl1 = varl1->next; + varlrhs = varlrhs->next; + continue; + } +#ifndef WithShell + if (lev == 0) // sommerfeld indeed + f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2], Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], cg->fgfs[varl1->data->sgfn], diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index f3a5b26..bed3fad 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -393,7 +393,8 @@ static const int k_bssn_cuda_bh_state_indices[3] = {18, 19, 20}; bool fill_bssn_cuda_views(Block *cg, MyList *vars, double **host_views, double *propspeeds = nullptr, - double *soa_flat = nullptr) + double *soa_flat = nullptr, + bool allow_trailing_vars = false) { int idx = 0; while (vars && idx < BSSN_CUDA_STATE_COUNT) @@ -410,7 +411,7 @@ bool fill_bssn_cuda_views(Block *cg, MyList *vars, vars = vars->next; ++idx; } - return idx == BSSN_CUDA_STATE_COUNT && vars == 0; + return idx == BSSN_CUDA_STATE_COUNT && (allow_trailing_vars || vars == 0); } bool bssn_cuda_use_resident_sync(int lev) @@ -687,7 +688,7 @@ void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) { double *state_out[BSSN_CUDA_STATE_COUNT]; - if (!fill_bssn_cuda_views(cg, vars, state_out)) + if (!fill_bssn_cuda_views(cg, vars, state_out, nullptr, nullptr, true)) { cout << "CUDA BSSN state list mismatch on resident state download" << endl; MPI_Abort(MPI_COMM_WORLD, 1); @@ -720,7 +721,7 @@ void bssn_cuda_download_level_state_if_present(MyList *PatL, MyList if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) { double *state_out[BSSN_CUDA_STATE_COUNT]; - if (!fill_bssn_cuda_views(cg, vars, state_out)) + if (!fill_bssn_cuda_views(cg, vars, state_out, nullptr, nullptr, true)) { cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl; MPI_Abort(MPI_COMM_WORLD, 1); diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 90bad7b..0c7b5e0 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -203,7 +203,16 @@ static bool escalar_host_pin_enabled() { static int enabled = -1; if (enabled < 0) { const char *env = getenv("AMSS_CUDA_PIN_ESCALAR_TRANSFERS"); - enabled = (!env || atoi(env) != 0) ? 1 : 0; + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +static bool escalar_gpu_rk_enabled() { + static int enabled = -1; + if (enabled < 0) { + const char *env = getenv("AMSS_ESCALAR_GPU_RK"); + enabled = (env && atoi(env) != 0) ? 1 : 0; } return enabled != 0; } @@ -588,6 +597,8 @@ static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = { struct StepContext { double *d_state0_mem; double *d_accum_mem; + double *d_escalar0_mem; + double *d_escalar_accum_mem; double *d_state_curr_mem; double *d_state_next_mem; std::array d_resident_mem; @@ -596,6 +607,8 @@ struct StepContext { double *h_comm_mem; std::array d_state0; std::array d_accum; + std::array d_escalar0; + std::array d_escalar_accum; std::array d_state_curr; std::array d_state_next; std::array, BSSN_RESIDENT_BANK_COUNT> d_resident; @@ -615,6 +628,7 @@ struct StepContext { StepContext() : d_state0_mem(nullptr), d_accum_mem(nullptr), + d_escalar0_mem(nullptr), d_escalar_accum_mem(nullptr), d_state_curr_mem(nullptr), d_state_next_mem(nullptr), d_resident_mem{}, d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr), @@ -625,6 +639,8 @@ struct StepContext { d_resident_mem.fill(nullptr); d_state0.fill(nullptr); d_accum.fill(nullptr); + d_escalar0.fill(nullptr); + d_escalar_accum.fill(nullptr); d_state_curr.fill(nullptr); d_state_next.fill(nullptr); for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { @@ -641,6 +657,8 @@ struct StepContext { struct StepAllocation { double *d_state0_mem; double *d_accum_mem; + double *d_escalar0_mem; + double *d_escalar_accum_mem; std::array d_resident_mem; double *d_matter_mem; double *d_comm_mem; @@ -661,6 +679,8 @@ static StepAllocation empty_step_allocation() StepAllocation alloc = {}; alloc.d_state0_mem = nullptr; alloc.d_accum_mem = nullptr; + alloc.d_escalar0_mem = nullptr; + alloc.d_escalar_accum_mem = nullptr; alloc.d_resident_mem.fill(nullptr); alloc.d_matter_mem = nullptr; alloc.d_comm_mem = nullptr; @@ -682,6 +702,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx) StepAllocation alloc = {}; alloc.d_state0_mem = ctx.d_state0_mem; alloc.d_accum_mem = ctx.d_accum_mem; + alloc.d_escalar0_mem = ctx.d_escalar0_mem; + alloc.d_escalar_accum_mem = ctx.d_escalar_accum_mem; alloc.d_resident_mem = ctx.d_resident_mem; alloc.d_matter_mem = ctx.d_matter_mem; alloc.d_comm_mem = ctx.d_comm_mem; @@ -692,6 +714,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx) alloc.cap_h_comm = ctx.cap_h_comm; ctx.d_state0_mem = nullptr; ctx.d_accum_mem = nullptr; + ctx.d_escalar0_mem = nullptr; + ctx.d_escalar_accum_mem = nullptr; ctx.d_state_curr_mem = nullptr; ctx.d_state_next_mem = nullptr; ctx.d_resident_mem.fill(nullptr); @@ -708,6 +732,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx) ctx.resident_clock = 0; ctx.d_state0.fill(nullptr); ctx.d_accum.fill(nullptr); + ctx.d_escalar0.fill(nullptr); + ctx.d_escalar_accum.fill(nullptr); ctx.d_state_curr.fill(nullptr); ctx.d_state_next.fill(nullptr); for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { @@ -725,6 +751,8 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc { ctx.d_state0_mem = alloc.d_state0_mem; ctx.d_accum_mem = alloc.d_accum_mem; + ctx.d_escalar0_mem = alloc.d_escalar0_mem; + ctx.d_escalar_accum_mem = alloc.d_escalar_accum_mem; ctx.d_resident_mem = alloc.d_resident_mem; ctx.d_state_curr_mem = nullptr; ctx.d_state_next_mem = nullptr; @@ -849,6 +877,12 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) ctx.d_resident[b][i] = ctx.d_resident_mem[b] + (size_t)i * all; } } + if (ctx.d_escalar0_mem && ctx.d_escalar_accum_mem) { + for (int i = 0; i < 2; ++i) { + ctx.d_escalar0[i] = ctx.d_escalar0_mem + (size_t)i * all; + ctx.d_escalar_accum[i] = ctx.d_escalar_accum_mem + (size_t)i * all; + } + } if (ctx.current_bank >= 0) { ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank]; ctx.d_state_curr = ctx.d_resident[ctx.current_bank]; @@ -859,6 +893,18 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) return ctx; } +static void ensure_escalar_buffers(StepContext &ctx, size_t all) +{ + if (!ctx.d_escalar0_mem) + CUDA_CHECK(cudaMalloc(&ctx.d_escalar0_mem, 2 * ctx.cap_all * sizeof(double))); + if (!ctx.d_escalar_accum_mem) + CUDA_CHECK(cudaMalloc(&ctx.d_escalar_accum_mem, 2 * ctx.cap_all * sizeof(double))); + for (int i = 0; i < 2; ++i) { + ctx.d_escalar0[i] = ctx.d_escalar0_mem + (size_t)i * all; + ctx.d_escalar_accum[i] = ctx.d_escalar_accum_mem + (size_t)i * all; + } +} + static void release_step_ctx(void *block_tag) { auto it = g_step_ctx.find(block_tag); @@ -7113,14 +7159,78 @@ int bssn_cuda_compute_escalar_matter(void *block_tag, ctx.d_matter[4], ctx.d_matter[5], ctx.d_matter[6], ctx.d_matter[7], ctx.d_matter[8], ctx.d_matter[9], a2); - CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaDeviceSynchronize()); + if (!escalar_gpu_rk_enabled()) { + CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaDeviceSynchronize()); + } ctx.matter_ready = true; (void)Lev; return 0; } +extern "C" +int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag, + int *ex, double *X, double *Y, double *Z, + double *Sphi_out_host, + double *Spi_out_host, + const double *propspeed, + const double *soa_flat, + const double *bbox, + double &dT, + int &RK4, + int &apply_bam_bc, + int &Symmetry, + int &Lev, + double &eps, + int &precor) +{ + if (!escalar_gpu_rk_enabled()) + return 1; + if (RK4 < 0 || RK4 > 3) + return 1; + + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const size_t bytes = all * sizeof(double); + setup_grid_params(ex, X, Y, Z, Symmetry, eps, precor); + StepContext &ctx = ensure_step_ctx(block_tag, all); + ensure_escalar_buffers(ctx, all); + + if (RK4 == 0) { + CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar0[0], g_buf.slot[S_S_arr], + bytes, cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar0[1], g_buf.slot[S_f_arr], + bytes, cudaMemcpyDeviceToDevice)); + } + + if (apply_bam_bc) { + gpu_sommerfeld_routbam(g_buf.slot[S_S_arr], g_buf.slot[S_Gamxa], + propspeed[0], + soa_flat[0], soa_flat[1], soa_flat[2], + X, Y, Z, bbox, Symmetry); + gpu_sommerfeld_routbam(g_buf.slot[S_f_arr], g_buf.slot[S_Gamya], + propspeed[1], + soa_flat[3], soa_flat[4], soa_flat[5], + X, Y, Z, bbox, Symmetry); + } + + kern_rk4_finalize<<>>(ctx.d_escalar0[0], g_buf.slot[S_Gamxa], + ctx.d_escalar_accum[0], dT, RK4); + kern_rk4_finalize<<>>(ctx.d_escalar0[1], g_buf.slot[S_Gamya], + ctx.d_escalar_accum[1], dT, RK4); + + try_pin_escalar_host_buffer(Sphi_out_host, bytes); + try_pin_escalar_host_buffer(Spi_out_host, bytes); + CUDA_CHECK(cudaMemcpyAsync(Sphi_out_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(Spi_out_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaDeviceSynchronize()); + (void)Lev; + return 0; +} + extern "C" int bssn_cuda_rk4_substep(void *block_tag, int *ex, double *X, double *Y, double *Z, diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index db903fa..7ac103b 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -69,6 +69,21 @@ int bssn_cuda_compute_escalar_matter(void *block_tag, int &co, int &apply_enforce_ga); +int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag, + int *ex, double *X, double *Y, double *Z, + double *Sphi_out_host, + double *Spi_out_host, + const double *propspeed, + const double *soa_flat, + const double *bbox, + double &dT, + int &RK4, + int &apply_bam_bc, + int &Symmetry, + int &Lev, + double &eps, + int &precor); + int bssn_cuda_copy_state_region_to_host(void *block_tag, int state_index, double *host_state, diff --git a/makefile_and_run.py b/makefile_and_run.py index c3f5dd6..3289471 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -151,6 +151,8 @@ def _gpu_runtime_env(): "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1", "AMSS_CUDA_AMR_RESTRICT_BATCH": "0", "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0", + "AMSS_CUDA_PIN_ESCALAR_TRANSFERS": "0", + "AMSS_ESCALAR_GPU_RK": "0", } if getattr(input_data, "Equation_Class", "") == "Z4C": defaults["AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP"] = "0" @@ -287,6 +289,8 @@ def run_ABE(): print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}") print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}") print(f" AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}") + print(f" AMSS_CUDA_PIN_ESCALAR_TRANSFERS={mpi_env.get('AMSS_CUDA_PIN_ESCALAR_TRANSFERS', '')}") + print(f" AMSS_ESCALAR_GPU_RK={mpi_env.get('AMSS_ESCALAR_GPU_RK', '')}") if "CUDA_MPS_PIPE_DIRECTORY" in mpi_env: print(f" CUDA_MPS_PIPE_DIRECTORY={mpi_env['CUDA_MPS_PIPE_DIRECTORY']}")