diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 7dc3a34..7010186 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -193,6 +193,28 @@ bool cuda_build_bssn_host_views(Block *block, } #endif +#if USE_CUDA_Z4C && (ABEtype == 2) +bool cuda_build_z4c_host_views(Block *block, + MyList *vars, + int state_count, + double **views) +{ + if (!block || !vars || !views || state_count != Z4C_CUDA_STATE_COUNT) + return false; + MyList *v = vars; + for (int i = 0; i < Z4C_CUDA_STATE_COUNT; ++i) + { + if (!v) + return false; + views[i] = block->fgfs[v->data->sgfn]; + if (!views[i]) + return false; + v = v->next; + } + return v == 0; +} +#endif + bool cuda_build_state_soa(MyList *vars, int state_count, double *soa_flat) @@ -220,6 +242,8 @@ int fortran_idint(double x) bool cuda_amr_restrict_device_enabled(); bool cuda_amr_prolong_device_enabled(); +bool cuda_z4c_amr_prolong_device_enabled(); +bool cuda_z4c_amr_unpack_device_enabled(); bool cuda_amr_restrict_compare_enabled(); bool cuda_amr_restrict_batch_enabled(); bool cuda_device_segment_batch_enabled(); @@ -324,7 +348,8 @@ bool cuda_state_count_direct_supported(int state_count) #endif } -bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type) +bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, + int type, MyList *VarLists = 0) { if (!src || !dst || !src->Bg) return false; @@ -342,7 +367,7 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg } if (type == 2 && !cuda_amr_restrict_device_enabled()) return false; - if (type == 3 && !cuda_amr_prolong_device_enabled()) + if (type == 3 && (!cuda_amr_prolong_device_enabled() || !cuda_z4c_amr_prolong_device_enabled())) return false; if (type == 2) { int a[3]; @@ -354,7 +379,17 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg if (!cuda_cell_gw3_prolong_params(src, dst, a, b)) return false; } - return z4c_cuda_has_resident_state(src->Bg) != 0; + if (z4c_cuda_has_resident_state(src->Bg) == 0) + return false; + if (type != 1 && VarLists) + { + double *view_ptrs[Z4C_CUDA_STATE_COUNT]; + if (!cuda_build_z4c_host_views(src->Bg, VarLists, Z4C_CUDA_STATE_COUNT, view_ptrs)) + return false; + if (z4c_cuda_resident_state_matches(src->Bg, view_ptrs) == 0) + return false; + } + return true; #elif USE_CUDA_BSSN if (bssn_cuda_has_resident_state(src->Bg) == 0) return false; @@ -380,12 +415,24 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg #endif } -bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type) +bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type, MyList *VarListd = 0) { if (type < 1 || type > 3 || !dst || !dst->Bg) return false; #if USE_CUDA_Z4C && (ABEtype == 2) - return z4c_cuda_has_resident_state(dst->Bg) != 0; + if (type != 1 && !cuda_z4c_amr_unpack_device_enabled()) + return false; + if (z4c_cuda_has_resident_state(dst->Bg) == 0) + return false; + if (type != 1 && VarListd) + { + double *view_ptrs[Z4C_CUDA_STATE_COUNT]; + if (!cuda_build_z4c_host_views(dst->Bg, VarListd, Z4C_CUDA_STATE_COUNT, view_ptrs)) + return false; + if (z4c_cuda_resident_state_matches(dst->Bg, view_ptrs) == 0) + return false; + } + return true; #elif USE_CUDA_BSSN return bssn_cuda_has_resident_state(dst->Bg) != 0; #else @@ -507,6 +554,28 @@ bool cuda_amr_prolong_device_enabled() return enabled != 0; } +bool cuda_z4c_amr_prolong_device_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_Z4C_AMR_PROLONG_DEVICE"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool cuda_z4c_amr_unpack_device_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_Z4C_AMR_UNPACK_DEVICE"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + bool cuda_amr_restrict_compare_enabled() { static int enabled = -1; @@ -1109,8 +1178,8 @@ int cuda_data_packer_device_batched(double *data, return -1; Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg; - if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type)) || - (dir == UNPACK && !cuda_can_direct_unpack(dst->data, type))) + if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type, VarLists)) || + (dir == UNPACK && !cuda_can_direct_unpack(dst->data, type, VarListd))) return -1; if (batch_block && (batch_block != block || batch_type != type)) @@ -1195,7 +1264,9 @@ bool cuda_segments_device_eligible(MyList *src, int rank_in, int dir, int myrank, - int state_count) + int state_count, + MyList *VarLists, + MyList *VarListd) { bool has_work = false; while (src && dst) @@ -1215,12 +1286,12 @@ bool cuda_segments_device_eligible(MyList *src, type = 3; if (dir == PACK) { - if (!cuda_can_direct_pack(src->data, dst->data, type)) + if (!cuda_can_direct_pack(src->data, dst->data, type, VarLists)) return false; } else { - if (!cuda_can_direct_unpack(dst->data, type)) + if (!cuda_can_direct_unpack(dst->data, type, VarListd)) return false; } } @@ -1318,11 +1389,13 @@ bool cuda_pack_to_device_eligible(MyList *src, MyList *dst, int rank_in, int state_count, - int myrank) + int myrank, + MyList *VarLists, + MyList *VarListd) { if (!cuda_aware_mpi_enabled() || !cuda_device_state_count_supported(state_count)) return false; - if (!cuda_segments_device_eligible(src, dst, rank_in, PACK, myrank, state_count)) + if (!cuda_segments_device_eligible(src, dst, rank_in, PACK, myrank, state_count, VarLists, VarListd)) return false; return true; } @@ -1331,11 +1404,13 @@ bool cuda_recv_to_device_eligible(MyList *src, MyList *dst, int rank_in, int state_count, - int myrank) + int myrank, + MyList *VarLists, + MyList *VarListd) { if (!cuda_aware_mpi_enabled() || !cuda_device_state_count_supported(state_count)) return false; - if (!cuda_segments_device_eligible(src, dst, rank_in, UNPACK, myrank, state_count)) + if (!cuda_segments_device_eligible(src, dst, rank_in, UNPACK, myrank, state_count, VarLists, VarListd)) return false; return true; } @@ -5133,7 +5208,7 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, dst->data, type)) + cuda_can_direct_pack(src->data, dst->data, type, VarLists)) { if (s_cuda_aware_pack_active) { handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry); @@ -5148,7 +5223,7 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, type)) + cuda_can_direct_unpack(dst->data, type, VarListd)) { if (s_cuda_aware_pack_active) { handled_by_cuda = cuda_direct_unpack_segment_from_device(data + size_out, dst->data, state_count, VarListd); @@ -5229,6 +5304,24 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data && dst->data->Bg) { +#if USE_CUDA_Z4C && (ABEtype == 2) + const char *z4c_amr_env = getenv("AMSS_CUDA_Z4C_AMR_DEVICE"); + if (state_count == Z4C_CUDA_STATE_COUNT && type != 1 && + z4c_amr_env && atoi(z4c_amr_env) != 0) + { + double *views[Z4C_CUDA_STATE_COUNT]; + if (cuda_build_z4c_host_views(dst->data->Bg, VarListd, state_count, views) && + z4c_cuda_resident_state_matches(dst->data->Bg, views) != 0) + { + if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data)) + { + cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + } + else +#endif if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data)) { cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl; @@ -5906,8 +5999,8 @@ void Parallel::transfer_cached(MyList **src, MyList *PatL, MyList *VarList, int Symmetr { for (int n = 0; n < cpusize; n++) { - cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank) ? 1 : 0; - cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank) ? 1 : 0; + cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank, VarList, VarList) ? 1 : 0; + cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank, VarList, VarList) ? 1 : 0; } cache.recv_buf_is_dev[myrank] = (cache.send_buf_is_dev[myrank] && cache.recv_buf_is_dev[myrank]) ? 1 : 0; for (int n = 0; n < cpusize; n++) diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C index 7585cc2..897f8fd 100644 --- a/AMSS_NCKU_source/Z4c_class.C +++ b/AMSS_NCKU_source/Z4c_class.C @@ -679,7 +679,11 @@ void Z4c_class::Step(int lev, int YN) MPI_Abort(MPI_COMM_WORLD, 1); } - Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); + { + Parallel::AsyncSyncState async_pre; + Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); + Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); + } if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime)) z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false); @@ -778,7 +782,11 @@ void Z4c_class::Step(int lev, int YN) MPI_Abort(MPI_COMM_WORLD, 1); } - Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]); + { + Parallel::AsyncSyncState async_cor; + Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); + Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); + } if (BH_num > 0 && lev == GH->levels - 1) { diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu index e410cbf..ec25cca 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.cu +++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu @@ -462,25 +462,28 @@ struct StepContext { std::array d_state_curr; std::array d_state_next; std::array d_matter; + std::array resident_host; size_t cap_all; size_t cap_comm; bool h_comm_pinned; size_t cap_h_comm; bool matter_ready; bool state_ready; + bool resident_host_valid; StepContext() : d_state0_mem(nullptr), d_accum_mem(nullptr), d_state_curr_mem(nullptr), d_state_next_mem(nullptr), d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr), cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0), - matter_ready(false), state_ready(false) + matter_ready(false), state_ready(false), resident_host_valid(false) { d_state0.fill(nullptr); d_accum.fill(nullptr); d_state_curr.fill(nullptr); d_state_next.fill(nullptr); d_matter.fill(nullptr); + resident_host.fill(nullptr); } }; @@ -544,6 +547,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx) ctx.d_state_curr.fill(nullptr); ctx.d_state_next.fill(nullptr); ctx.d_matter.fill(nullptr); + ctx.resident_host.fill(nullptr); + ctx.resident_host_valid = false; return alloc; } @@ -562,6 +567,8 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc ctx.cap_h_comm = alloc.cap_h_comm; ctx.matter_ready = false; ctx.state_ready = false; + ctx.resident_host.fill(nullptr); + ctx.resident_host_valid = false; } static void recycle_step_allocation(StepAllocation &alloc) @@ -5794,6 +5801,37 @@ static bool has_resident_state(void *block_tag) return it != g_step_ctx.end() && it->second.state_ready; } +static bool resident_key_usable(double **host_key) +{ + if (!host_key) return false; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + if (!host_key[i]) return false; + } + return true; +} + +static bool resident_key_matches(const StepContext &ctx, double **host_key) +{ + if (!ctx.state_ready || !ctx.resident_host_valid || !resident_key_usable(host_key)) + return false; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + if (ctx.resident_host[i] != host_key[i]) return false; + } + return true; +} + +static void set_resident_key(StepContext &ctx, double **host_key) +{ + if (!resident_key_usable(host_key)) { + ctx.resident_host.fill(nullptr); + ctx.resident_host_valid = false; + return; + } + for (int i = 0; i < BSSN_STATE_COUNT; ++i) + ctx.resident_host[i] = host_key[i]; + ctx.resident_host_valid = true; +} + #define pow2(x) ((x) * (x)) @@ -7786,10 +7824,13 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag, bind_state_input_slots(ctx.d_state_curr); bind_state_output_slots(ctx.d_state_next); } - double t0 = profile ? cuda_profile_now_ms() : 0.0; if (!use_resident_state || !ctx.state_ready) { upload_state_inputs(state_host_in, all); + if (use_resident_state) { + ctx.state_ready = true; + set_resident_key(ctx, state_host_in); + } } if (apply_enforce_ga) { kern_enforce_ga_cuda<<>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz], @@ -7849,6 +7890,7 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag, std::swap(ctx.d_state_curr_mem, ctx.d_state_next_mem); ctx.d_state_curr.swap(ctx.d_state_next); ctx.state_ready = true; + set_resident_key(ctx, state_host_out); } else { download_state_outputs(state_host_out, all); } @@ -8154,6 +8196,17 @@ extern "C" int z4c_cuda_has_resident_state(void *block_tag) return has_resident_state(block_tag) ? 1 : 0; } +extern "C" int z4c_cuda_resident_state_matches(void *block_tag, + double **state_host_key) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + auto it = g_step_ctx.find(block_tag); + if (it == g_step_ctx.end()) return 0; + return resident_key_matches(it->second, state_host_key) ? 1 : 0; +} + extern "C" void z4c_cuda_release_step_ctx(void *block_tag) { using namespace z4c_cuda; diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.h b/AMSS_NCKU_source/z4c_rhs_cuda.h index 0adea53..2f81925 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.h +++ b/AMSS_NCKU_source/z4c_rhs_cuda.h @@ -142,6 +142,8 @@ int z4c_cuda_download_constraint_outputs(int *ex, double **constraint_host_out); int z4c_cuda_has_resident_state(void *block_tag); +int z4c_cuda_resident_state_matches(void *block_tag, + double **state_host_key); void z4c_cuda_release_step_ctx(void *block_tag);