From e4c10eca0f37653faf8f641a14cd2996f30112c2 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Sun, 3 May 2026 16:05:47 +0800 Subject: [PATCH] Stabilize EScalar CUDA fallback path --- AMSS_NCKU_source/Parallel.C | 489 ++++++++++++++++-- AMSS_NCKU_source/bssnEScalar_class.C | 227 +++++++-- AMSS_NCKU_source/bssn_class.C | 146 ++++-- AMSS_NCKU_source/bssn_rhs_cuda.cu | 708 ++++++++++++++++++++++++++- AMSS_NCKU_source/bssn_rhs_cuda.h | 99 ++++ 5 files changed, 1542 insertions(+), 127 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 740640d..0c3041e 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -173,13 +173,24 @@ int cuda_state_var_count(MyList *src_vars, MyList *dst_vars) return (src_vars || dst_vars) ? -1 : count; } +int cuda_var_list_count(MyList *vars) +{ + int count = 0; + while (vars) + { + ++count; + vars = vars->next; + } + return count; +} + #if USE_CUDA_BSSN bool cuda_build_bssn_host_views(Block *block, MyList *vars, int state_count, double **views) { - if (!block || !vars || !views || state_count != BSSN_CUDA_STATE_COUNT) + if (!block || !block->fgfs || !vars || !views || state_count != BSSN_CUDA_STATE_COUNT) return false; MyList *v = vars; for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i) @@ -191,6 +202,37 @@ bool cuda_build_bssn_host_views(Block *block, } return v == 0; } + +bool cuda_build_escalar_host_views(Block *block, + MyList *vars, + double **views) +{ + if (!block || !block->fgfs || !vars || !views) + return false; + MyList *v = vars; + for (int i = 0; i < 2; ++i) + { + if (!v) + return false; + views[i] = block->fgfs[v->data->sgfn]; + if (!views[i]) + return false; + v = v->next; + } + return v == 0; +} + +bool cuda_escalar_list_looks_valid(MyList *vars) +{ + MyList *v = vars; + for (int i = 0; i < 2; ++i) + { + if (!v || !v->data) + return false; + v = v->next; + } + return v == 0; +} #endif #if USE_CUDA_Z4C && (ABEtype == 2) @@ -341,7 +383,7 @@ bool cuda_state_count_direct_supported(int state_count) #if USE_CUDA_Z4C && (ABEtype == 2) return state_count == Z4C_CUDA_STATE_COUNT; #elif USE_CUDA_BSSN - return state_count == BSSN_CUDA_STATE_COUNT; + return state_count == BSSN_CUDA_STATE_COUNT || state_count == 2; #else (void)state_count; return false; @@ -391,9 +433,42 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg } return true; #elif USE_CUDA_BSSN + if (VarLists) + { + int count = 0; + for (MyList *v = VarLists; v; v = v->next) ++count; + if (count == 2) + { + if (type != 1) + { + int a[3], b[3]; + if (type == 2) + { + if (!cuda_amr_restrict_device_enabled()) + return false; + if (!cuda_cell_gw3_restrict_params(src, dst, a)) + return false; + } + else if (type == 3) + { + if (!cuda_amr_prolong_device_enabled()) + return false; + if (!cuda_cell_gw3_prolong_params(src, dst, a, b)) + return false; + } + else + return false; + } + double *views[2]; + if (cuda_build_escalar_host_views(src->Bg, VarLists, views)) + return bssn_cuda_escalar_has_resident_fields(src->Bg, views[0], views[1]) != 0; + return cuda_escalar_list_looks_valid(VarLists) && + bssn_cuda_escalar_has_any_resident_fields(src->Bg) != 0; + } + } if (bssn_cuda_has_resident_state(src->Bg) == 0) return false; - if (VarLists) + if (VarLists && src->Bg->fgfs) { double *view_ptrs[BSSN_CUDA_STATE_COUNT]; if (!cuda_build_bssn_host_views(src->Bg, VarLists, BSSN_CUDA_STATE_COUNT, view_ptrs)) @@ -435,6 +510,21 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type, MyList (void)VarListd; return true; #elif USE_CUDA_BSSN + if (VarListd) + { + int count = 0; + for (MyList *v = VarListd; v; v = v->next) ++count; + if (count == 2) + { + double *views[2]; + if (!cuda_build_escalar_host_views(dst->Bg, VarListd, views)) + return cuda_escalar_list_looks_valid(VarListd) && + (type == 1 || type == 2 || type == 3); + if (bssn_cuda_escalar_has_resident_fields(dst->Bg, views[0], views[1]) != 0) + return true; + return type == 1 || type == 2 || type == 3; + } + } if (bssn_cuda_has_resident_state(dst->Bg) == 0) return false; if (VarListd) @@ -461,6 +551,26 @@ bool cuda_direct_pack_segment(double *buffer, if (state_count != Z4C_CUDA_STATE_COUNT) return false; #elif USE_CUDA_BSSN + if (state_count == 2) + { + const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + double *views[2]; + double **key = 0; + if (cuda_build_escalar_host_views(src->Bg, VarLists, views)) + key = views; + else if (!cuda_escalar_list_looks_valid(VarLists)) + return false; + const int i0 = cuda_seg_begin(dst, src->Bg, 0); + const int j0 = cuda_seg_begin(dst, src->Bg, 1); + const int k0 = cuda_seg_begin(dst, src->Bg, 2); + const bool ok = bssn_cuda_pack_escalar_batch_to_host_buffer( + src->Bg, key, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + if (sync_profile_enabled()) + sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; + return ok; + } if (state_count != BSSN_CUDA_STATE_COUNT) return false; #else @@ -508,6 +618,26 @@ bool cuda_direct_unpack_segment(double *buffer, if (state_count != Z4C_CUDA_STATE_COUNT) return false; #elif USE_CUDA_BSSN + if (state_count == 2) + { + const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + double *views[2]; + double **key = 0; + if (cuda_build_escalar_host_views(dst->Bg, VarListd, views)) + key = views; + else if (!cuda_escalar_list_looks_valid(VarListd)) + return false; + const int i0 = cuda_seg_begin(dst, dst->Bg, 0); + const int j0 = cuda_seg_begin(dst, dst->Bg, 1); + const int k0 = cuda_seg_begin(dst, dst->Bg, 2); + const bool ok = bssn_cuda_unpack_escalar_batch_from_host_buffer( + dst->Bg, key, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + if (sync_profile_enabled()) + sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0; + return ok; + } if (state_count != BSSN_CUDA_STATE_COUNT) return false; #else @@ -560,8 +690,10 @@ bool cuda_direct_pack_bssn_prefix_to_host(double *buffer, return false; double *views[BSSN_CUDA_STATE_COUNT]; double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; - if (!cuda_build_bssn_host_views(src->Bg, VarLists, BSSN_CUDA_STATE_COUNT, views) || - !cuda_build_state_soa(VarLists, BSSN_CUDA_STATE_COUNT, soa_flat)) + const bool have_views = + src->Bg->fgfs && + cuda_build_bssn_host_views(src->Bg, VarLists, BSSN_CUDA_STATE_COUNT, views); + if (!cuda_build_state_soa(VarLists, BSSN_CUDA_STATE_COUNT, soa_flat)) return false; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; bool ok = false; @@ -570,33 +702,51 @@ bool cuda_direct_pack_bssn_prefix_to_host(double *buffer, const int i0 = cuda_seg_begin(dst, src->Bg, 0); const int j0 = cuda_seg_begin(dst, src->Bg, 1); const int k0 = cuda_seg_begin(dst, src->Bg, 2); - ok = bssn_cuda_pack_state_batch_to_host_buffer_for_host_views( - src->Bg, views, BSSN_CUDA_STATE_COUNT, buffer, src->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + ok = have_views + ? bssn_cuda_pack_state_batch_to_host_buffer_for_host_views( + src->Bg, views, BSSN_CUDA_STATE_COUNT, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0 + : bssn_cuda_pack_state_batch_to_host_buffer( + src->Bg, BSSN_CUDA_STATE_COUNT, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; } else if (type == 2) { int first_fine[3]; if (!cuda_cell_gw3_restrict_params(src, dst, first_fine)) return false; - ok = bssn_cuda_restrict_state_batch_to_host_buffer_for_host_views( - src->Bg, views, BSSN_CUDA_STATE_COUNT, buffer, src->Bg->shape, - dst->shape[0], dst->shape[1], dst->shape[2], - first_fine[0], first_fine[1], first_fine[2], - soa_flat) == 0; + ok = have_views + ? bssn_cuda_restrict_state_batch_to_host_buffer_for_host_views( + src->Bg, views, BSSN_CUDA_STATE_COUNT, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2], + soa_flat) == 0 + : bssn_cuda_restrict_state_batch_to_host_buffer( + src->Bg, BSSN_CUDA_STATE_COUNT, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2], + soa_flat) == 0; } else if (type == 3) { int first_fine_ii[3], coarse_lb[3]; if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb)) return false; - ok = bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views( - src->Bg, views, BSSN_CUDA_STATE_COUNT, buffer, src->Bg->shape, - dst->shape[0], dst->shape[1], dst->shape[2], - first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], - coarse_lb[0], coarse_lb[1], coarse_lb[2], - soa_flat) == 0; + ok = have_views + ? bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views( + src->Bg, views, BSSN_CUDA_STATE_COUNT, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2], + soa_flat) == 0 + : bssn_cuda_prolong_state_batch_to_host_buffer( + src->Bg, BSSN_CUDA_STATE_COUNT, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2], + soa_flat) == 0; } if (sync_profile_enabled()) sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; @@ -881,6 +1031,53 @@ bool cuda_direct_pack_segment_to_device(double *buffer, } #endif #if USE_CUDA_BSSN + if (state_count == 2) + { + const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + bool ok = false; + double *views[2]; + double soa_flat[6]; + const bool have_views = cuda_build_escalar_host_views(src->Bg, VarLists, views); + const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat); + if (!have_views) + return false; + if (type == 1) + { + const int i0 = cuda_seg_begin(dst, src->Bg, 0); + const int j0 = cuda_seg_begin(dst, src->Bg, 1); + const int k0 = cuda_seg_begin(dst, src->Bg, 2); + ok = bssn_cuda_pack_escalar_batch_to_device_buffer( + src->Bg, views, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + } + else if (type == 2) + { + int first_fine[3]; + if (!cuda_cell_gw3_restrict_params(src, dst, first_fine)) + return false; + ok = bssn_cuda_restrict_escalar_batch_to_device_buffer( + src->Bg, views, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2], + have_soa ? soa_flat : 0) == 0; + } + else if (type == 3) + { + int first_fine_ii[3], coarse_lb[3]; + if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb)) + return false; + ok = bssn_cuda_prolong_escalar_batch_to_device_buffer( + src->Bg, views, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2], + have_soa ? soa_flat : 0) == 0; + } + if (sync_profile_enabled()) + sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; + return ok; + } if (state_count != BSSN_CUDA_STATE_COUNT) return false; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; @@ -1073,6 +1270,23 @@ bool cuda_direct_unpack_segment_from_device(double *buffer, } #endif #if USE_CUDA_BSSN + if (state_count == 2) + { + const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + const int i0 = cuda_seg_begin(dst, dst->Bg, 0); + const int j0 = cuda_seg_begin(dst, dst->Bg, 1); + const int k0 = cuda_seg_begin(dst, dst->Bg, 2); + double *views[2]; + if (!cuda_build_escalar_host_views(dst->Bg, VarListd, views)) + return false; + const bool ok = bssn_cuda_unpack_escalar_batch_from_device_buffer( + dst->Bg, views, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + if (sync_profile_enabled()) + sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0; + return ok; + } if (state_count != BSSN_CUDA_STATE_COUNT) return false; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; @@ -1127,8 +1341,15 @@ bool cuda_download_resident_subset_to_host(Block *block, } #endif #if USE_CUDA_BSSN - if (!block || state_count != BSSN_CUDA_STATE_COUNT) + if (!block || !block->fgfs || state_count != BSSN_CUDA_STATE_COUNT) + { + if (getenv("AMSS_CUDA_FALLBACK_DIAG")) + fprintf(stderr, + "[AMSS-CUDA-FALLBACK] invalid subset request block=%p fgfs=%p state_count=%d expected=%d\n", + (void *)block, block ? (void *)block->fgfs : 0, + state_count, BSSN_CUDA_STATE_COUNT); return false; + } if (bssn_cuda_has_resident_state(block) == 0) return true; int indices[BSSN_CUDA_STATE_COUNT]; @@ -1137,14 +1358,24 @@ bool cuda_download_resident_subset_to_host(Block *block, for (int i = 0; i < state_count; ++i) { if (!v) - return false; + return true; indices[i] = i; views[i] = block->fgfs[v->data->sgfn]; + if (!views[i]) + return true; v = v->next; } if (bssn_cuda_resident_state_matches(block, views) == 0) - return false; - return bssn_cuda_download_state_subset(block, block->shape, state_count, indices, views) == 0; + return true; + const int rc = bssn_cuda_download_state_subset(block, block->shape, state_count, indices, views); + if (rc != 0 && getenv("AMSS_CUDA_FALLBACK_DIAG")) + fprintf(stderr, + "[AMSS-CUDA-FALLBACK] subset download rc=%d block=%p lev=%d shape=[%d,%d,%d] first_var=%s sgfn=%d\n", + rc, (void *)block, block->lev, + block->shape[0], block->shape[1], block->shape[2], + (vars && vars->data) ? vars->data->name : "(null)", + (vars && vars->data) ? vars->data->sgfn : -1); + return rc == 0; #else (void)block; (void)vars; (void)state_count; return false; @@ -1197,7 +1428,8 @@ bool cuda_device_state_count_supported(int state_count) return true; #endif #if USE_CUDA_BSSN - return state_count == BSSN_CUDA_STATE_COUNT; + return state_count == BSSN_CUDA_STATE_COUNT || state_count == 2 || + state_count == BSSN_CUDA_STATE_COUNT + 2; #else (void)state_count; return false; @@ -1316,6 +1548,12 @@ int cuda_data_packer_device_batched(double *data, const int state_count = cuda_state_var_count(VarLists, VarListd); if (!cuda_device_state_count_supported(state_count)) return -1; +#if USE_CUDA_BSSN + if (state_count == 2) + return -1; + if (state_count > BSSN_CUDA_STATE_COUNT) + return -1; +#endif int size_out = 0; Block *batch_block = 0; int batch_type = 0; @@ -1620,6 +1858,109 @@ int data_packer_with_device_buffer(double *data, } #endif +#if USE_CUDA_BSSN || USE_CUDA_Z4C +std::vector &cuda_host_stage_buffer() +{ + static thread_local std::vector buffer; + return buffer; +} + +bool cuda_pack_one_host_field_to_device(double *device_buffer, + Parallel::gridseg *src, + Parallel::gridseg *dst, + int type, + var *src_var, + int Symmetry) +{ + if (!device_buffer || !src || !dst || !src->Bg || !dst->Bg || + !src->Bg->fgfs || !src_var || !src->Bg->fgfs[src_var->sgfn]) + return false; + const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2]; + if (region_all <= 0) + return false; + std::vector &stage = cuda_host_stage_buffer(); + stage.resize((size_t)region_all); + int DIM = dim; + + switch (type) + { + case 1: + f_copy(DIM, dst->llb, dst->uub, dst->shape, stage.data(), + src->Bg->bbox, src->Bg->bbox + dim, src->Bg->shape, + src->Bg->fgfs[src_var->sgfn], + dst->llb, dst->uub); + break; + case 2: + f_restrict3(DIM, dst->llb, dst->uub, dst->shape, stage.data(), + src->Bg->bbox, src->Bg->bbox + dim, src->Bg->shape, + src->Bg->fgfs[src_var->sgfn], + dst->llb, dst->uub, src_var->SoA, Symmetry); + break; + case 3: + f_prolong3(DIM, src->Bg->bbox, src->Bg->bbox + dim, src->Bg->shape, + src->Bg->fgfs[src_var->sgfn], + dst->llb, dst->uub, dst->shape, stage.data(), + dst->llb, dst->uub, src_var->SoA, Symmetry); + break; + default: + return false; + } + + cudaError_t err = cudaMemcpy(device_buffer, stage.data(), + (size_t)region_all * sizeof(double), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + fprintf(stderr, "Parallel: host tail cudaMemcpy H2D failed, err=%d\n", (int)err); + return false; + } + return true; +} + +bool cuda_unpack_one_device_field_to_host(double *device_buffer, + Parallel::gridseg *dst, + var *dst_var) +{ + if (!device_buffer || !dst || !dst->Bg || !dst->Bg->fgfs || + !dst_var || !dst->Bg->fgfs[dst_var->sgfn]) + return false; + const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2]; + if (region_all <= 0) + return false; + std::vector &stage = cuda_host_stage_buffer(); + stage.resize((size_t)region_all); + + cudaError_t err = cudaMemcpy(stage.data(), device_buffer, + (size_t)region_all * sizeof(double), + cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + fprintf(stderr, "Parallel: host tail cudaMemcpy D2H failed, err=%d\n", (int)err); + return false; + } + + int DIM = dim; + f_copy(DIM, + dst->Bg->bbox, dst->Bg->bbox + dim, dst->Bg->shape, + dst->Bg->fgfs[dst_var->sgfn], + dst->llb, dst->uub, dst->shape, stage.data(), + dst->llb, dst->uub); + return true; +} + +#if USE_CUDA_BSSN +void cuda_download_escalar_tail_if_present(Block *block, MyList *tail) +{ + if (!block || !block->fgfs || !tail || !tail->next) + return; + bssn_cuda_escalar_download_fields_if_present( + block, block->shape, + block->fgfs[tail->data->sgfn], + block->fgfs[tail->next->data->sgfn]); +} +#endif +#endif + } // namespace int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion @@ -5377,6 +5718,15 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data && src->data->Bg) + { + cuda_download_escalar_tail_if_present(src->data->Bg, varls); + } +#endif bool handled_by_cuda = false; int cuda_handled_count = state_count; if (dir == PACK && (type == 1 || s_cuda_aware_pack_active) && @@ -5409,10 +5759,32 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, dst->data, + BSSN_CUDA_STATE_COUNT, type, VarLists, Symmetry)) + { + handled_by_cuda = true; + cuda_handled_count = BSSN_CUDA_STATE_COUNT; + } + else if (s_cuda_aware_pack_active && + state_idx == 0 && + state_count == BSSN_CUDA_STATE_COUNT + 2 && + dir == UNPACK && + cuda_direct_unpack_segment_from_device(data + size_out, dst->data, + BSSN_CUDA_STATE_COUNT, VarListd)) + { + handled_by_cuda = true; + cuda_handled_count = BSSN_CUDA_STATE_COUNT; + } +#endif #if USE_CUDA_BSSN else if (!s_cuda_aware_pack_active && state_idx == 0 && - state_count > BSSN_CUDA_STATE_COUNT && + state_count >= BSSN_CUDA_STATE_COUNT && dir == PACK && cuda_direct_pack_bssn_prefix_to_host(data + size_out, src->data, dst->data, type, VarLists, Symmetry)) @@ -5422,7 +5794,7 @@ int Parallel::data_packer(double *data, MyList *src, MyList

BSSN_CUDA_STATE_COUNT && + state_count >= BSSN_CUDA_STATE_COUNT && dir == UNPACK && cuda_direct_unpack_bssn_prefix_from_host(data + size_out, dst->data, type, VarListd)) @@ -5430,6 +5802,27 @@ int Parallel::data_packer(double *data, MyList *src, MyList

= BSSN_CUDA_STATE_COUNT && + dir == PACK && + cuda_pack_one_host_field_to_device(data + size_out, src->data, dst->data, + type, varls->data, Symmetry)) + { + handled_by_cuda = true; + cuda_handled_count = 1; + } + else if (s_cuda_aware_pack_active && + state_count == BSSN_CUDA_STATE_COUNT + 2 && + state_idx >= BSSN_CUDA_STATE_COUNT && + dir == UNPACK && + cuda_unpack_one_device_field_to_host(data + size_out, dst->data, varld->data)) + { + handled_by_cuda = true; + cuda_handled_count = 1; + } #endif if (!handled_by_cuda) { @@ -5443,7 +5836,8 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data && src->data->Bg) { if (!cuda_download_resident_subset_to_host(src->data->Bg, VarLists, state_count)) @@ -5496,7 +5890,7 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data->llb, dst->data->uub, dst->data->shape, data + size_out, dst->data->llb, dst->data->uub); #if USE_CUDA_BSSN || USE_CUDA_Z4C - if (cuda_state_count_direct_supported(state_count) && + if (state_count == BSSN_CUDA_STATE_COUNT && dst->data && dst->data->Bg) { #if USE_CUDA_Z4C && (ABEtype == 2) @@ -7395,11 +7789,27 @@ void Parallel::prepare_inter_time_level(Patch *Pat, if (myrank == cg->rank) { #if USE_CUDA_BSSN + const int state_count = cuda_state_var_count(VarList1, VarList2); + if (state_count == 2) + { + double *src1_scalar[2], *src2_scalar[2], *dst_scalar[2]; + if (cuda_build_escalar_host_views(cg, VarList1, src1_scalar) && + cuda_build_escalar_host_views(cg, VarList2, src2_scalar) && + cuda_build_escalar_host_views(cg, VarList3, dst_scalar) && + bssn_cuda_prepare_escalar_inter_time_level(cg, cg->shape, + src1_scalar, src2_scalar, 0, dst_scalar, + 2, tindex) == 0) + { + if (BP == Pat->ble) + break; + BP = BP->next; + continue; + } + } bool bssn_prefix_done = false; double *src1_views[BSSN_CUDA_STATE_COUNT]; double *src2_views[BSSN_CUDA_STATE_COUNT]; double *dst_views[BSSN_CUDA_STATE_COUNT]; - const int state_count = cuda_state_var_count(VarList1, VarList2); if (state_count >= BSSN_CUDA_STATE_COUNT && cuda_build_bssn_host_views(cg, VarList1, BSSN_CUDA_STATE_COUNT, src1_views) && cuda_build_bssn_host_views(cg, VarList2, BSSN_CUDA_STATE_COUNT, src2_views) && @@ -7488,12 +7898,29 @@ void Parallel::prepare_inter_time_level(Patch *Pat, if (myrank == cg->rank) { #if USE_CUDA_BSSN + const int state_count = cuda_state_var_count(VarList1, VarList2); + if (state_count == 2) + { + double *src1_scalar[2], *src2_scalar[2], *src3_scalar[2], *dst_scalar[2]; + if (cuda_build_escalar_host_views(cg, VarList1, src1_scalar) && + cuda_build_escalar_host_views(cg, VarList2, src2_scalar) && + cuda_build_escalar_host_views(cg, VarList3, src3_scalar) && + cuda_build_escalar_host_views(cg, VarList4, dst_scalar) && + bssn_cuda_prepare_escalar_inter_time_level(cg, cg->shape, + src1_scalar, src2_scalar, src3_scalar, dst_scalar, + 3, tindex) == 0) + { + if (BP == Pat->ble) + break; + BP = BP->next; + continue; + } + } bool bssn_prefix_done = false; double *src1_views[BSSN_CUDA_STATE_COUNT]; double *src2_views[BSSN_CUDA_STATE_COUNT]; double *src3_views[BSSN_CUDA_STATE_COUNT]; double *dst_views[BSSN_CUDA_STATE_COUNT]; - const int state_count = cuda_state_var_count(VarList1, VarList2); if (state_count >= BSSN_CUDA_STATE_COUNT && cuda_build_bssn_host_views(cg, VarList1, BSSN_CUDA_STATE_COUNT, src1_views) && cuda_build_bssn_host_views(cg, VarList2, BSSN_CUDA_STATE_COUNT, src2_views) && diff --git a/AMSS_NCKU_source/bssnEScalar_class.C b/AMSS_NCKU_source/bssnEScalar_class.C index e1cf9d0..b0c422c 100644 --- a/AMSS_NCKU_source/bssnEScalar_class.C +++ b/AMSS_NCKU_source/bssnEScalar_class.C @@ -140,6 +140,88 @@ bool escalar_gpu_rk_enabled() return enabled != 0; } +bool escalar_resident_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_ESCALAR_RESIDENT"); + const char *experimental = getenv("AMSS_ESCALAR_RESIDENT_EXPERIMENTAL"); + enabled = (env && atoi(env) != 0 && + experimental && atoi(experimental) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool escalar_step_profile_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_ESCALAR_STEP_PROFILE"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +int escalar_step_profile_every() +{ + static int every = -1; + if (every < 0) + { + const char *env = getenv("AMSS_ESCALAR_STEP_PROFILE_EVERY"); + every = (env && atoi(env) > 0) ? atoi(env) : 1; + } + return every; +} + +struct EScalarStepProfile +{ + double start; + double predictor_rhs; + double predictor_sync; + double analysis; + double corrector_rhs; + double corrector_sync; + double restrict_prolong; + double other_sync; +}; + +void escalar_profile_init(EScalarStepProfile &p) +{ + p.start = MPI_Wtime(); + p.predictor_rhs = 0.0; + p.predictor_sync = 0.0; + p.analysis = 0.0; + p.corrector_rhs = 0.0; + p.corrector_sync = 0.0; + p.restrict_prolong = 0.0; + p.other_sync = 0.0; +} + +void escalar_profile_add(double &bucket, double t0) +{ + bucket += MPI_Wtime() - t0; +} + +void escalar_profile_report(const EScalarStepProfile &p, int lev, int myrank) +{ + if (myrank != 0 || !escalar_step_profile_enabled()) + return; + static long long call_count = 0; + ++call_count; + const int every = escalar_step_profile_every(); + if (every > 1 && (call_count % every) != 0) + return; + const double total = MPI_Wtime() - p.start; + fprintf(stderr, + "[AMSS-ESCALAR-PROFILE] call=%lld lev=%d total=%.6f pred_rhs=%.6f pred_sync=%.6f analysis=%.6f corr_rhs=%.6f corr_sync=%.6f rp=%.6f other_sync=%.6f\n", + call_count, lev, total, p.predictor_rhs, p.predictor_sync, + p.analysis, p.corrector_rhs, p.corrector_sync, + p.restrict_prolong, p.other_sync); + fflush(stderr); +} + void clear_var_list(MyList *&list) { if (list) @@ -173,6 +255,34 @@ void download_bssn_cuda_prefix_if_present(MyList *PatL, } } +void download_escalar_cuda_pair_if_present(MyList *PatL, + var *Sphi_var, + var *Spi_var, + int myrank) +{ + if (!Sphi_var || !Spi_var) + return; + while (PatL) + { + MyList *BP = PatL->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { + bssn_cuda_escalar_download_fields_if_present( + cg, cg->shape, + cg->fgfs[Sphi_var->sgfn], + cg->fgfs[Spi_var->sgfn]); + } + if (BP == PatL->data->ble) + break; + BP = BP->next; + } + PatL = PatL->next; + } +} + int run_bssn_escalar_cuda_substep(Block *cg, MyList *state_in_list, MyList *state_out_list, @@ -992,8 +1102,8 @@ void bssnEScalar_class::Read_Pablo() //================================================================================================ -void bssnEScalar_class::Step(int lev, int YN) -{ +void bssnEScalar_class::Step(int lev, int YN) +{ double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); #ifdef With_AHF AH_Step_Find(lev, dT_lev); @@ -1003,15 +1113,18 @@ void bssnEScalar_class::Step(int lev, int YN) if (lev < GH->movls) ndeps = numepsb; double TRK4 = PhysTime; - int iter_count = 0; // count RK4 substeps - int pre = 0, cor = 1; - int ERROR = 0; - - MyList *sPp; - // Predictor - MyList *Pp = GH->PatL[lev]; - while (Pp) - { + int iter_count = 0; // count RK4 substeps + int pre = 0, cor = 1; + int ERROR = 0; + EScalarStepProfile escalar_profile; + escalar_profile_init(escalar_profile); + + MyList *sPp; + // Predictor + const double escalar_profile_predictor_rhs_start = MPI_Wtime(); + MyList *Pp = GH->PatL[lev]; + while (Pp) + { MyList *BP = Pp->data->blb; while (BP) { @@ -1101,6 +1214,8 @@ void bssnEScalar_class::Step(int lev, int YN) { if (scalar_gpu_rk_done) { + if (!escalar_resident_enabled()) + { #ifndef WithShell if (lev > 0) // fix BD point #endif @@ -1112,6 +1227,7 @@ void bssnEScalar_class::Step(int lev, int YN) cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], varl0->data->SoA, Symmetry, cor); + } varl0 = varl0->next; varl = varl->next; @@ -1157,11 +1273,12 @@ void bssnEScalar_class::Step(int lev, int YN) if (BP == Pp->data->ble) break; BP = BP->next; - } - Pp = Pp->next; - } - // check error information - { + } + Pp = Pp->next; + } + escalar_profile_add(escalar_profile.predictor_rhs, escalar_profile_predictor_rhs_start); + // check error information + { int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } @@ -1325,10 +1442,14 @@ void bssnEScalar_class::Step(int lev, int YN) #endif #if USE_CUDA_BSSN + const double escalar_profile_predictor_sync_start = MPI_Wtime(); Parallel::Sync_cached(GH->PatL[lev], BSSNSynchList_pre, Symmetry, sync_cache_pre[lev]); Parallel::Sync_cached(GH->PatL[lev], ScalarSynchList_pre, Symmetry, sync_cache_scalar_pre[lev]); + escalar_profile_add(escalar_profile.predictor_sync, escalar_profile_predictor_sync_start); #else + const double escalar_profile_predictor_sync_start = MPI_Wtime(); Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); + escalar_profile_add(escalar_profile.predictor_sync, escalar_profile_predictor_sync_start); #endif #ifdef WithShell @@ -1381,21 +1502,28 @@ void bssnEScalar_class::Step(int lev, int YN) } } } - // data analysis part - // Warning NOTE: the variables1 are used as temp storege room - if (lev == a_lev) - { - AnalysisStuff_EScalar(lev, dT_lev); - } - // corrector - for (iter_count = 1; iter_count < 4; iter_count++) - { + // data analysis part + // Warning NOTE: the variables1 are used as temp storege room + if (lev == a_lev) + { + const double escalar_profile_analysis_start = MPI_Wtime(); +#if USE_CUDA_BSSN + if (escalar_resident_enabled()) + download_escalar_cuda_pair_if_present(GH->PatL[lev], Sphi, Spi, myrank); +#endif + AnalysisStuff_EScalar(lev, dT_lev); + escalar_profile_add(escalar_profile.analysis, escalar_profile_analysis_start); + } + // corrector + for (iter_count = 1; iter_count < 4; iter_count++) + { // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt; - if (iter_count == 1 || iter_count == 3) - TRK4 += dT_lev / 2; - Pp = GH->PatL[lev]; - while (Pp) - { + if (iter_count == 1 || iter_count == 3) + TRK4 += dT_lev / 2; + const double escalar_profile_corrector_rhs_start = MPI_Wtime(); + Pp = GH->PatL[lev]; + while (Pp) + { MyList *BP = Pp->data->blb; while (BP) { @@ -1494,6 +1622,8 @@ void bssnEScalar_class::Step(int lev, int YN) { if (scalar_gpu_rk_done) { + if (!escalar_resident_enabled()) + { #ifndef WithShell if (lev > 0) // fix BD point #endif @@ -1505,6 +1635,7 @@ void bssnEScalar_class::Step(int lev, int YN) cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn], varl0->data->SoA, Symmetry, cor); + } varl0 = varl0->next; varl = varl->next; @@ -1552,11 +1683,12 @@ void bssnEScalar_class::Step(int lev, int YN) if (BP == Pp->data->ble) break; BP = BP->next; - } - Pp = Pp->next; - } - - // check error information + } + Pp = Pp->next; + } + escalar_profile_add(escalar_profile.corrector_rhs, escalar_profile_corrector_rhs_start); + + // check error information { int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); @@ -1731,10 +1863,14 @@ void bssnEScalar_class::Step(int lev, int YN) #endif #if USE_CUDA_BSSN + const double escalar_profile_corrector_sync_start = MPI_Wtime(); Parallel::Sync_cached(GH->PatL[lev], BSSNSynchList_cor, Symmetry, sync_cache_cor[lev]); Parallel::Sync_cached(GH->PatL[lev], ScalarSynchList_cor, Symmetry, sync_cache_scalar_cor[lev]); + escalar_profile_add(escalar_profile.corrector_sync, escalar_profile_corrector_sync_start); #else + const double escalar_profile_corrector_sync_start = MPI_Wtime(); Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]); + escalar_profile_add(escalar_profile.corrector_sync, escalar_profile_corrector_sync_start); #endif #ifdef WithShell @@ -1837,17 +1973,21 @@ void bssnEScalar_class::Step(int lev, int YN) #if (RPS == 0) // mesh refinement boundary part + const double escalar_profile_rp_start = MPI_Wtime(); #if USE_CUDA_BSSN { const char *mixed_env = getenv("AMSS_ESCALAR_MIXED_GPU_RP"); const bool mixed_gpu_rp = (mixed_env && atoi(mixed_env) != 0); const char *split_env = getenv("AMSS_ESCALAR_SPLIT_RP"); const bool split_rp = (split_env && atoi(split_env) != 0); + if (escalar_resident_enabled() && !split_rp) + download_escalar_cuda_pair_if_present(GH->PatL[lev], Sphi1, Spi1, myrank); if (!mixed_gpu_rp && !split_rp) download_bssn_cuda_prefix_if_present(GH->PatL[lev], SynchList_cor, myrank); } #endif RestrictProlong(lev, YN, BB); + escalar_profile_add(escalar_profile.restrict_prolong, escalar_profile_rp_start); #ifdef WithShell if (lev == 0) @@ -1910,18 +2050,19 @@ void bssnEScalar_class::Step(int lev, int YN) } #endif // for black hole position - if (BH_num > 0 && lev == GH->levels - 1) - { + if (BH_num > 0 && lev == GH->levels - 1) + { for (int ithBH = 0; ithBH < BH_num; ithBH++) { Porg0[ithBH][0] = Porg1[ithBH][0]; Porg0[ithBH][1] = Porg1[ithBH][1]; - Porg0[ithBH][2] = Porg1[ithBH][2]; - } - } -} - -//================================================================================================ + Porg0[ithBH][2] = Porg1[ithBH][2]; + } + } + escalar_profile_report(escalar_profile, lev, myrank); +} + +//================================================================================================ diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index bed3fad..c028280 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -740,6 +740,38 @@ void bssn_cuda_download_level_state_if_present(MyList *PatL, MyList } } +void bssn_cuda_download_level_scalar_tail_if_present(MyList *PatL, + MyList *vars, + int myrank) +{ + MyList *tail = vars; + for (int i = 0; i < BSSN_CUDA_STATE_COUNT && tail; ++i) + tail = tail->next; + if (!tail || !tail->next || tail->next->next) + return; + + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && cg->fgfs) + { + bssn_cuda_escalar_download_fields_if_present( + cg, cg->shape, + cg->fgfs[tail->data->sgfn], + cg->fgfs[tail->next->data->sgfn]); + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} + void bssn_cuda_release_level_state(MyList *PatL, int myrank) { MyList *Pp = PatL; @@ -770,9 +802,30 @@ void bssn_cuda_flush_level_before_regrid(MyList *PatL, bssn_cuda_download_level_state_if_present(PatL, oldL, myrank); bssn_cuda_download_level_state_if_present(PatL, stateL, myrank); bssn_cuda_download_level_state_if_present(PatL, preL, myrank); + bssn_cuda_download_level_scalar_tail_if_present(PatL, corL, myrank); + bssn_cuda_download_level_scalar_tail_if_present(PatL, oldL, myrank); + bssn_cuda_download_level_scalar_tail_if_present(PatL, stateL, myrank); + bssn_cuda_download_level_scalar_tail_if_present(PatL, preL, myrank); bssn_cuda_release_level_state(PatL, myrank); } +void bssn_cuda_flush_all_levels_before_regrid(cgh *GH, + MyList *corL, + MyList *oldL, + MyList *stateL, + MyList *preL, + int myrank) +{ + if (!GH) + return; + for (int il = 0; il < GH->levels; ++il) + { + bssn_cuda_flush_level_before_regrid(GH->PatL[il], + corL, oldL, stateL, preL, + myrank); + } +} + #if USE_CUDA_Z4C && (ABEtype == 2) bool fill_z4c_cuda_views_for_regrid(Block *cg, MyList *vars, double **host_views) @@ -3234,12 +3287,27 @@ void bssn_class::Evolve(int Steps) #if (REGLEV == 1) STEP_TIMER_DECL(timer_regrid); #if USE_CUDA_BSSN && (ABEtype != 2) - for (int il = 0; il < GH->levels; il++) - if (bssn_cuda_should_flush_before_regrid(GH, il, Symmetry, BH_num, Porg0)) - bssn_cuda_flush_level_before_regrid(GH->PatL[il], - SynchList_cor, OldStateList, - StateList, SynchList_pre, - myrank); + if (amss_escalar_mixed_gpu_rp_enabled()) + { + bool any_cuda_regrid_flush = false; + for (int il = 0; il < GH->levels; il++) + if (bssn_cuda_should_flush_before_regrid(GH, il, Symmetry, BH_num, Porg0)) + any_cuda_regrid_flush = true; + if (any_cuda_regrid_flush) + bssn_cuda_flush_all_levels_before_regrid(GH, + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); + } + else + { + for (int il = 0; il < GH->levels; il++) + if (bssn_cuda_should_flush_before_regrid(GH, il, Symmetry, BH_num, Porg0)) + bssn_cuda_flush_level_before_regrid(GH->PatL[il], + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); + } #endif #if USE_CUDA_Z4C && USE_CUDA_BSSN && (ABEtype == 2) for (int il = 0; il < GH->levels; il++) @@ -3491,10 +3559,18 @@ void bssn_class::RecursiveStep(int lev) STEP_TIMER_DECL(timer_regrid_onelevel); #if USE_CUDA_BSSN if (bssn_cuda_should_flush_before_regrid(GH, lev, Symmetry, BH_num, Porg0)) - bssn_cuda_flush_level_before_regrid(GH->PatL[lev], - SynchList_cor, OldStateList, - StateList, SynchList_pre, - myrank); + { + if (amss_escalar_mixed_gpu_rp_enabled()) + bssn_cuda_flush_all_levels_before_regrid(GH, + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); + else + bssn_cuda_flush_level_before_regrid(GH->PatL[lev], + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); + } #endif if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, @@ -3684,10 +3760,10 @@ void bssn_class::ParallelStep() #if (REGLEV == 0) #if USE_CUDA_BSSN if (bssn_cuda_should_flush_before_regrid(GH, GH->mylev, Symmetry, BH_num, Porg0)) - bssn_cuda_flush_level_before_regrid(GH->PatL[GH->mylev], - SynchList_cor, OldStateList, - StateList, SynchList_pre, - myrank); + bssn_cuda_flush_all_levels_before_regrid(GH, + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); #endif if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, @@ -3817,6 +3893,20 @@ void bssn_class::ParallelStep() // Parallel::Dump_Data(GH->PatL[lev],StateList,0,PhysTime,dT_lev); +#if USE_CUDA_BSSN && (ABEtype != 2) + const bool cuda_recursive_regrid_needs_full_flush = + bssn_cuda_should_flush_before_regrid(GH, lev, Symmetry, BH_num, Porg0) || + (lev < GH->levels - 1 && + bssn_cuda_should_flush_before_regrid(GH, lev + 1, Symmetry, BH_num, Porg0)) || + (lev - 1 >= GH->movls && + bssn_cuda_should_flush_before_regrid(GH, lev - 1, Symmetry, BH_num, Porg0)); + if (cuda_recursive_regrid_needs_full_flush) + bssn_cuda_flush_all_levels_before_regrid(GH, + SynchList_cor, OldStateList, + StateList, SynchList_pre, + myrank); +#endif + { MPI_Status status; // receive @@ -3860,13 +3950,6 @@ void bssn_class::ParallelStep() if (lev + 1 >= GH->movls) { // GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0, -#if USE_CUDA_BSSN - if (bssn_cuda_should_flush_before_regrid(GH, lev + 1, Symmetry, BH_num, Porg0)) - bssn_cuda_flush_level_before_regrid(GH->PatL[lev + 1], - SynchList_cor, OldStateList, - StateList, SynchList_pre, - myrank); -#endif if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) @@ -3886,13 +3969,6 @@ void bssn_class::ParallelStep() // for this level if (YN == 1) { -#if USE_CUDA_BSSN - if (bssn_cuda_should_flush_before_regrid(GH, lev, Symmetry, BH_num, Porg0)) - bssn_cuda_flush_level_before_regrid(GH->PatL[lev], - SynchList_cor, OldStateList, - StateList, SynchList_pre, - myrank); -#endif if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) @@ -3916,13 +3992,6 @@ void bssn_class::ParallelStep() if (YN == 1) { // GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0, -#if USE_CUDA_BSSN - if (bssn_cuda_should_flush_before_regrid(GH, lev - 1, Symmetry, BH_num, Porg0)) - bssn_cuda_flush_level_before_regrid(GH->PatL[lev - 1], - SynchList_cor, OldStateList, - StateList, SynchList_pre, - myrank); -#endif if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) @@ -3943,13 +4012,6 @@ void bssn_class::ParallelStep() if (i % 4 == 3) { // GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0, -#if USE_CUDA_BSSN - if (bssn_cuda_should_flush_before_regrid(GH, lev - 1, Symmetry, BH_num, Porg0)) - bssn_cuda_flush_level_before_regrid(GH->PatL[lev - 1], - SynchList_cor, OldStateList, - StateList, SynchList_pre, - myrank); -#endif if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 0c7b5e0..1f009d1 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -217,6 +217,17 @@ static bool escalar_gpu_rk_enabled() { return enabled != 0; } +static bool escalar_resident_enabled() { + static int enabled = -1; + if (enabled < 0) { + const char *env = getenv("AMSS_ESCALAR_RESIDENT"); + const char *experimental = getenv("AMSS_ESCALAR_RESIDENT_EXPERIMENTAL"); + enabled = (env && atoi(env) != 0 && + experimental && atoi(experimental) != 0) ? 1 : 0; + } + return enabled != 0; +} + static void try_pin_escalar_host_buffer(void *ptr, size_t bytes) { if (!ptr || bytes == 0 || !escalar_host_pin_enabled()) return; @@ -523,6 +534,8 @@ static constexpr int BSSN_STATE_COUNT = 24; static constexpr int BSSN_MATTER_COUNT = 10; static constexpr int BSSN_LK_FIELD_COUNT = 24; static constexpr int BSSN_RESIDENT_BANK_COUNT = 4; +static constexpr int ESCALAR_FIELD_COUNT = 2; +static constexpr int ESCALAR_RESIDENT_BANK_COUNT = 4; static const int k_state_input_slots[BSSN_STATE_COUNT] = { S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz, @@ -599,6 +612,7 @@ struct StepContext { double *d_accum_mem; double *d_escalar0_mem; double *d_escalar_accum_mem; + std::array d_escalar_resident_mem; double *d_state_curr_mem; double *d_state_next_mem; std::array d_resident_mem; @@ -609,6 +623,11 @@ struct StepContext { std::array d_accum; std::array d_escalar0; std::array d_escalar_accum; + std::array, ESCALAR_RESIDENT_BANK_COUNT> d_escalar_resident; + std::array, ESCALAR_RESIDENT_BANK_COUNT> escalar_host; + std::array escalar_valid; + std::array escalar_host_clean; + std::array escalar_age; std::array d_state_curr; std::array d_state_next; std::array, BSSN_RESIDENT_BANK_COUNT> d_resident; @@ -624,23 +643,35 @@ struct StepContext { bool matter_ready; bool state_ready; int current_bank; + int current_escalar_bank; unsigned long long resident_clock; + unsigned long long escalar_clock; StepContext() : d_state0_mem(nullptr), d_accum_mem(nullptr), d_escalar0_mem(nullptr), d_escalar_accum_mem(nullptr), + d_escalar_resident_mem{}, d_state_curr_mem(nullptr), d_state_next_mem(nullptr), d_resident_mem{}, d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr), cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0), matter_ready(false), state_ready(false), - current_bank(-1), resident_clock(0) + current_bank(-1), current_escalar_bank(-1), + resident_clock(0), escalar_clock(0) { + d_escalar_resident_mem.fill(nullptr); d_resident_mem.fill(nullptr); d_state0.fill(nullptr); d_accum.fill(nullptr); d_escalar0.fill(nullptr); d_escalar_accum.fill(nullptr); + for (int b = 0; b < ESCALAR_RESIDENT_BANK_COUNT; ++b) { + d_escalar_resident[b].fill(nullptr); + escalar_host[b].fill(nullptr); + } + escalar_valid.fill(false); + escalar_host_clean.fill(false); + escalar_age.fill(0); d_state_curr.fill(nullptr); d_state_next.fill(nullptr); for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { @@ -659,6 +690,7 @@ struct StepAllocation { double *d_accum_mem; double *d_escalar0_mem; double *d_escalar_accum_mem; + std::array d_escalar_resident_mem; std::array d_resident_mem; double *d_matter_mem; double *d_comm_mem; @@ -681,6 +713,7 @@ static StepAllocation empty_step_allocation() alloc.d_accum_mem = nullptr; alloc.d_escalar0_mem = nullptr; alloc.d_escalar_accum_mem = nullptr; + alloc.d_escalar_resident_mem.fill(nullptr); alloc.d_resident_mem.fill(nullptr); alloc.d_matter_mem = nullptr; alloc.d_comm_mem = nullptr; @@ -704,6 +737,7 @@ static StepAllocation detach_step_allocation(StepContext &ctx) alloc.d_accum_mem = ctx.d_accum_mem; alloc.d_escalar0_mem = ctx.d_escalar0_mem; alloc.d_escalar_accum_mem = ctx.d_escalar_accum_mem; + alloc.d_escalar_resident_mem = ctx.d_escalar_resident_mem; alloc.d_resident_mem = ctx.d_resident_mem; alloc.d_matter_mem = ctx.d_matter_mem; alloc.d_comm_mem = ctx.d_comm_mem; @@ -716,6 +750,7 @@ static StepAllocation detach_step_allocation(StepContext &ctx) ctx.d_accum_mem = nullptr; ctx.d_escalar0_mem = nullptr; ctx.d_escalar_accum_mem = nullptr; + ctx.d_escalar_resident_mem.fill(nullptr); ctx.d_state_curr_mem = nullptr; ctx.d_state_next_mem = nullptr; ctx.d_resident_mem.fill(nullptr); @@ -729,11 +764,20 @@ static StepAllocation detach_step_allocation(StepContext &ctx) ctx.matter_ready = false; ctx.state_ready = false; ctx.current_bank = -1; + ctx.current_escalar_bank = -1; ctx.resident_clock = 0; + ctx.escalar_clock = 0; ctx.d_state0.fill(nullptr); ctx.d_accum.fill(nullptr); ctx.d_escalar0.fill(nullptr); ctx.d_escalar_accum.fill(nullptr); + for (int b = 0; b < ESCALAR_RESIDENT_BANK_COUNT; ++b) { + ctx.d_escalar_resident[b].fill(nullptr); + ctx.escalar_host[b].fill(nullptr); + } + ctx.escalar_valid.fill(false); + ctx.escalar_host_clean.fill(false); + ctx.escalar_age.fill(0); ctx.d_state_curr.fill(nullptr); ctx.d_state_next.fill(nullptr); for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { @@ -753,6 +797,7 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc ctx.d_accum_mem = alloc.d_accum_mem; ctx.d_escalar0_mem = alloc.d_escalar0_mem; ctx.d_escalar_accum_mem = alloc.d_escalar_accum_mem; + ctx.d_escalar_resident_mem = alloc.d_escalar_resident_mem; ctx.d_resident_mem = alloc.d_resident_mem; ctx.d_state_curr_mem = nullptr; ctx.d_state_next_mem = nullptr; @@ -766,11 +811,19 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc ctx.matter_ready = false; ctx.state_ready = false; ctx.current_bank = -1; + ctx.current_escalar_bank = -1; ctx.resident_clock = 0; + ctx.escalar_clock = 0; for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { ctx.resident_host[b].fill(nullptr); ctx.resident_host_clean[b].fill(0); } + for (int b = 0; b < ESCALAR_RESIDENT_BANK_COUNT; ++b) { + ctx.escalar_host[b].fill(nullptr); + ctx.escalar_valid[b] = false; + ctx.escalar_host_clean[b] = false; + ctx.escalar_age[b] = 0; + } ctx.resident_age.fill(0); ctx.resident_valid.fill(false); } @@ -883,6 +936,12 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) ctx.d_escalar_accum[i] = ctx.d_escalar_accum_mem + (size_t)i * all; } } + for (int b = 0; b < ESCALAR_RESIDENT_BANK_COUNT; ++b) { + if (ctx.d_escalar_resident_mem[b]) { + for (int i = 0; i < ESCALAR_FIELD_COUNT; ++i) + ctx.d_escalar_resident[b][i] = ctx.d_escalar_resident_mem[b] + (size_t)i * all; + } + } if (ctx.current_bank >= 0) { ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank]; ctx.d_state_curr = ctx.d_resident[ctx.current_bank]; @@ -899,10 +958,19 @@ static void ensure_escalar_buffers(StepContext &ctx, size_t all) CUDA_CHECK(cudaMalloc(&ctx.d_escalar0_mem, 2 * ctx.cap_all * sizeof(double))); if (!ctx.d_escalar_accum_mem) CUDA_CHECK(cudaMalloc(&ctx.d_escalar_accum_mem, 2 * ctx.cap_all * sizeof(double))); + for (int b = 0; b < ESCALAR_RESIDENT_BANK_COUNT; ++b) { + if (!ctx.d_escalar_resident_mem[b]) + CUDA_CHECK(cudaMalloc(&ctx.d_escalar_resident_mem[b], + ESCALAR_FIELD_COUNT * ctx.cap_all * sizeof(double))); + } for (int i = 0; i < 2; ++i) { ctx.d_escalar0[i] = ctx.d_escalar0_mem + (size_t)i * all; ctx.d_escalar_accum[i] = ctx.d_escalar_accum_mem + (size_t)i * all; } + for (int b = 0; b < ESCALAR_RESIDENT_BANK_COUNT; ++b) { + for (int i = 0; i < ESCALAR_FIELD_COUNT; ++i) + ctx.d_escalar_resident[b][i] = ctx.d_escalar_resident_mem[b] + (size_t)i * all; + } } static void release_step_ctx(void *block_tag) @@ -2744,6 +2812,28 @@ static void gpu_copy_patch_boundary_batch(int all, touch_zmin, touch_zmax); } +static void gpu_copy_escalar_patch_boundary(int all, + int touch_xmin, int touch_xmax, + int touch_ymin, int touch_ymax, + int touch_zmin, int touch_zmax) +{ + if (!(touch_xmin || touch_xmax || touch_ymin || touch_ymax || touch_zmin || touch_zmax)) + return; + + PatchBoundaryTables tables = {}; + tables.src_fields[0] = g_buf.slot[S_S_arr]; + tables.src_fields[1] = g_buf.slot[S_f_arr]; + tables.dst_fields[0] = g_buf.slot[S_Gamxa]; + tables.dst_fields[1] = g_buf.slot[S_Gamya]; + + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)ESCALAR_FIELD_COUNT); + kern_copy_patch_boundary_batched<<>>( + tables, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); +} + __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx, double * __restrict__ gxy, double * __restrict__ gxz, @@ -5224,6 +5314,143 @@ static bool any_resident_bank_valid(const StepContext &ctx) return false; } +static bool escalar_key_usable(double **host_key) +{ + return host_key && host_key[0] && host_key[1]; +} + +static bool escalar_key_matches(const StepContext &ctx, int bank, double **host_key) +{ + if (!escalar_key_usable(host_key) || + bank < 0 || bank >= ESCALAR_RESIDENT_BANK_COUNT) + return false; + return ctx.escalar_host[bank][0] == host_key[0] && + ctx.escalar_host[bank][1] == host_key[1]; +} + +static int find_escalar_bank(const StepContext &ctx, double **host_key) +{ + if (!escalar_key_usable(host_key)) return -1; + for (int b = 0; b < ESCALAR_RESIDENT_BANK_COUNT; ++b) { + if (escalar_key_matches(ctx, b, host_key)) + return b; + } + return -1; +} + +static void mark_escalar_current_bank(StepContext &ctx, int bank) +{ + if (bank >= 0 && bank < ESCALAR_RESIDENT_BANK_COUNT) + ctx.current_escalar_bank = bank; +} + +static int choose_escalar_bank_for_reuse(StepContext &ctx, int avoid_bank) +{ + for (int b = 0; b < ESCALAR_RESIDENT_BANK_COUNT; ++b) { + if (b != avoid_bank && !ctx.escalar_valid[b]) + return b; + } + int best = -1; + unsigned long long best_age = 0; + for (int b = 0; b < ESCALAR_RESIDENT_BANK_COUNT; ++b) { + if (b == avoid_bank) continue; + if (best < 0 || ctx.escalar_age[b] < best_age) { + best = b; + best_age = ctx.escalar_age[b]; + } + } + if (best < 0) best = 0; + ctx.escalar_valid[best] = false; + ctx.escalar_host_clean[best] = false; + ctx.escalar_host[best].fill(nullptr); + ctx.escalar_age[best] = 0; + return best; +} + +static void assign_escalar_key(StepContext &ctx, int bank, double **host_key) +{ + ctx.escalar_host[bank][0] = host_key[0]; + ctx.escalar_host[bank][1] = host_key[1]; + ctx.escalar_host_clean[bank] = false; + ctx.escalar_age[bank] = ++ctx.escalar_clock; +} + +static int ensure_escalar_bank(StepContext &ctx, + double **host_key, + size_t all, + bool upload_if_missing, + int avoid_bank = -1) +{ + if (!escalar_key_usable(host_key)) + return -1; + ensure_escalar_buffers(ctx, all); + int bank = find_escalar_bank(ctx, host_key); + if (bank < 0) { + bank = choose_escalar_bank_for_reuse(ctx, avoid_bank); + assign_escalar_key(ctx, bank, host_key); + } + ctx.escalar_age[bank] = ++ctx.escalar_clock; + if (!ctx.escalar_valid[bank] && upload_if_missing) { + const size_t bytes = all * sizeof(double); + CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar_resident[bank][0], host_key[0], + bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar_resident[bank][1], host_key[1], + bytes, cudaMemcpyHostToDevice)); + ctx.escalar_valid[bank] = true; + ctx.escalar_host_clean[bank] = true; + } + return bank; +} + +static int reserve_escalar_output_bank(StepContext &ctx, + double **host_key, + size_t all, + int input_bank) +{ + if (!escalar_key_usable(host_key)) + return -1; + ensure_escalar_buffers(ctx, all); + if (escalar_key_matches(ctx, input_bank, host_key)) + return input_bank; + int bank = find_escalar_bank(ctx, host_key); + if (bank < 0) + bank = choose_escalar_bank_for_reuse(ctx, input_bank); + assign_escalar_key(ctx, bank, host_key); + ctx.escalar_valid[bank] = false; + return bank; +} + +static void mark_escalar_output_valid(StepContext &ctx, int bank) +{ + if (bank < 0 || bank >= ESCALAR_RESIDENT_BANK_COUNT) return; + ctx.escalar_valid[bank] = true; + ctx.escalar_host_clean[bank] = false; + ctx.escalar_age[bank] = ++ctx.escalar_clock; + mark_escalar_current_bank(ctx, bank); +} + +static int active_or_keyed_escalar_bank(StepContext &ctx, + double **host_key, + size_t all, + bool upload_if_missing) +{ + if (escalar_key_usable(host_key)) { + int bank = ensure_escalar_bank(ctx, host_key, all, upload_if_missing); + mark_escalar_current_bank(ctx, bank); + return bank; + } + if (ctx.current_escalar_bank >= 0 && + ctx.escalar_valid[ctx.current_escalar_bank]) + return ctx.current_escalar_bank; + for (int b = 0; b < ESCALAR_RESIDENT_BANK_COUNT; ++b) { + if (ctx.escalar_valid[b]) { + mark_escalar_current_bank(ctx, b); + return b; + } + } + return -1; +} + static void update_state_ready(StepContext &ctx) { ctx.state_ready = any_resident_bank_valid(ctx); @@ -7105,12 +7332,23 @@ int bssn_cuda_compute_escalar_matter(void *block_tag, g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]); set_resident_host_clean(ctx, input_bank, false); } - try_pin_escalar_host_buffer(Sphi_host, bytes); - try_pin_escalar_host_buffer(Spi_host, bytes); - try_pin_escalar_host_buffer(Sphi_rhs_host, bytes); - try_pin_escalar_host_buffer(Spi_rhs_host, bytes); - CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_S_arr], Sphi_host, bytes, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_f_arr], Spi_host, bytes, cudaMemcpyHostToDevice)); + double *scalar_in_key[ESCALAR_FIELD_COUNT] = { Sphi_host, Spi_host }; + const bool use_escalar_resident = escalar_resident_enabled() && escalar_gpu_rk_enabled(); + if (use_escalar_resident) { + const int scalar_bank = ensure_escalar_bank(ctx, scalar_in_key, all, true); + if (scalar_bank < 0) return 1; + CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_S_arr], ctx.d_escalar_resident[scalar_bank][0], + bytes, cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_f_arr], ctx.d_escalar_resident[scalar_bank][1], + bytes, cudaMemcpyDeviceToDevice)); + } else { + try_pin_escalar_host_buffer(Sphi_host, bytes); + try_pin_escalar_host_buffer(Spi_host, bytes); + try_pin_escalar_host_buffer(Sphi_rhs_host, bytes); + try_pin_escalar_host_buffer(Spi_rhs_host, bytes); + CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_S_arr], Sphi_host, bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_f_arr], Spi_host, bytes, cudaMemcpyHostToDevice)); + } double *src_fields[3] = { g_buf.slot[S_chi], g_buf.slot[S_Lap], g_buf.slot[S_S_arr] @@ -7195,7 +7433,16 @@ int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag, const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t bytes = all * sizeof(double); + int touch_xmin = 0, touch_xmax = 0; + int touch_ymin = 0, touch_ymax = 0; + int touch_zmin = 0, touch_zmax = 0; setup_grid_params(ex, X, Y, Z, Symmetry, eps, precor); + if (Lev > 0) { + compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); + } StepContext &ctx = ensure_step_ctx(block_tag, all); ensure_escalar_buffers(ctx, all); @@ -7221,11 +7468,29 @@ int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag, ctx.d_escalar_accum[0], dT, RK4); kern_rk4_finalize<<>>(ctx.d_escalar0[1], g_buf.slot[S_Gamya], ctx.d_escalar_accum[1], dT, RK4); + if (Lev > 0) { + gpu_copy_escalar_patch_boundary((int)all, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); + } - try_pin_escalar_host_buffer(Sphi_out_host, bytes); - try_pin_escalar_host_buffer(Spi_out_host, bytes); - CUDA_CHECK(cudaMemcpyAsync(Sphi_out_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpyAsync(Spi_out_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost)); + if (escalar_resident_enabled()) { + double *scalar_out_key[ESCALAR_FIELD_COUNT] = { Sphi_out_host, Spi_out_host }; + const int input_bank = find_escalar_bank(ctx, scalar_out_key); + const int out_bank = reserve_escalar_output_bank(ctx, scalar_out_key, all, input_bank); + if (out_bank < 0) return 1; + CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar_resident[out_bank][0], g_buf.slot[S_Gamxa], + bytes, cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar_resident[out_bank][1], g_buf.slot[S_Gamya], + bytes, cudaMemcpyDeviceToDevice)); + mark_escalar_output_valid(ctx, out_bank); + } else { + try_pin_escalar_host_buffer(Sphi_out_host, bytes); + try_pin_escalar_host_buffer(Spi_out_host, bytes); + CUDA_CHECK(cudaMemcpyAsync(Sphi_out_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(Spi_out_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost)); + } CUDA_CHECK(cudaDeviceSynchronize()); (void)Lev; return 0; @@ -7743,6 +8008,20 @@ int bssn_cuda_restrict_state_batch_to_host_buffer_for_host_views(void *block_tag return 0; } +extern "C" +int bssn_cuda_restrict_state_batch_to_host_buffer(void *block_tag, + int state_count, + double *host_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *state_soa) +{ + return bssn_cuda_restrict_state_batch_to_host_buffer_for_host_views( + block_tag, nullptr, state_count, host_buffer, ex, + sx, sy, sz, fi0, fj0, fk0, state_soa); +} + extern "C" int bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views(void *block_tag, double **state_host_key, @@ -7780,6 +8059,21 @@ int bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views(void *block_tag, return 0; } +extern "C" +int bssn_cuda_prolong_state_batch_to_host_buffer(void *block_tag, + int state_count, + double *host_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *state_soa) +{ + return bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views( + block_tag, nullptr, state_count, host_buffer, ex, + sx, sy, sz, ii0, jj0, kk0, lbc_i, lbc_j, lbc_k, state_soa); +} + static void copy_state_device_batch(void *block_tag, int state_count, double *device_buffer, @@ -8249,6 +8543,335 @@ int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_ta return 0; } +static int escalar_bank_for_key(void *block_tag, + double **scalar_host_key, + int *ex, + bool upload_if_missing) +{ + if (!escalar_key_usable(scalar_host_key)) return -1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + return active_or_keyed_escalar_bank(ctx, scalar_host_key, + (size_t)ex[0] * ex[1] * ex[2], + upload_if_missing); +} + +static int copy_escalar_batch_host(void *block_tag, + double **scalar_host_key, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz, + cudaMemcpyKind kind) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (!host_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const bool unpack = (kind == cudaMemcpyHostToDevice); + const int bank = active_or_keyed_escalar_bank(ctx, scalar_host_key, + (size_t)ex[0] * ex[1] * ex[2], + unpack); + if (bank < 0 || (!unpack && !ctx.escalar_valid[bank])) return 1; + const int region_all = sx * sy * sz; + double *d_comm = ensure_step_comm_buffer(ctx, (size_t)ESCALAR_FIELD_COUNT * region_all); + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)ESCALAR_FIELD_COUNT); + if (kind == cudaMemcpyDeviceToHost) { + kern_pack_state_region_batch<<>>( + ctx.d_escalar_resident_mem[bank], d_comm, + ex[0], ex[1], i0, j0, k0, sx, sy, sz, + region_all, ESCALAR_FIELD_COUNT, + ex[0] * ex[1] * ex[2]); + CUDA_CHECK(cudaMemcpy(host_buffer, d_comm, + (size_t)ESCALAR_FIELD_COUNT * region_all * sizeof(double), + cudaMemcpyDeviceToHost)); + } else { + CUDA_CHECK(cudaMemcpy(d_comm, host_buffer, + (size_t)ESCALAR_FIELD_COUNT * region_all * sizeof(double), + cudaMemcpyHostToDevice)); + kern_unpack_state_region_batch<<>>( + ctx.d_escalar_resident_mem[bank], d_comm, + ex[0], ex[1], i0, j0, k0, sx, sy, sz, + region_all, ESCALAR_FIELD_COUNT, + ex[0] * ex[1] * ex[2]); + mark_escalar_output_valid(ctx, bank); + } + return 0; +} + +static int copy_escalar_batch_device(void *block_tag, + double **scalar_host_key, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz, + bool pack) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int bank = active_or_keyed_escalar_bank(ctx, scalar_host_key, + (size_t)ex[0] * ex[1] * ex[2], + !pack); + if (bank < 0 || (pack && !ctx.escalar_valid[bank])) return 1; + const int region_all = sx * sy * sz; + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)ESCALAR_FIELD_COUNT); + if (pack) { + kern_pack_state_region_batch<<>>( + ctx.d_escalar_resident_mem[bank], device_buffer, + ex[0], ex[1], i0, j0, k0, sx, sy, sz, + region_all, ESCALAR_FIELD_COUNT, + ex[0] * ex[1] * ex[2]); + } else { + kern_unpack_state_region_batch<<>>( + ctx.d_escalar_resident_mem[bank], device_buffer, + ex[0], ex[1], i0, j0, k0, sx, sy, sz, + region_all, ESCALAR_FIELD_COUNT, + ex[0] * ex[1] * ex[2]); + mark_escalar_output_valid(ctx, bank); + } + return 0; +} + +static int restrict_escalar_batch(void *block_tag, + double **scalar_host_key, + void *buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *scalar_soa, + bool device_buffer) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (!buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int bank = active_or_keyed_escalar_bank(ctx, scalar_host_key, + (size_t)ex[0] * ex[1] * ex[2], + false); + if (bank < 0 || !ctx.escalar_valid[bank]) return 1; + const int region_all = sx * sy * sz; + double *d_comm = device_buffer ? (double *)buffer : + ensure_step_comm_buffer(ctx, (size_t)ESCALAR_FIELD_COUNT * region_all); + upload_comm_state_soa(scalar_soa, ESCALAR_FIELD_COUNT); + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)ESCALAR_FIELD_COUNT); + kern_restrict_state_region_batch<<>>( + ctx.d_escalar_resident_mem[bank], d_comm, + ex[0], ex[1], sx, sy, sz, + fi0, fj0, fk0, region_all, ESCALAR_FIELD_COUNT, + ex[0] * ex[1] * ex[2]); + if (!device_buffer) { + CUDA_CHECK(cudaMemcpy(buffer, d_comm, + (size_t)ESCALAR_FIELD_COUNT * region_all * sizeof(double), + cudaMemcpyDeviceToHost)); + } + return 0; +} + +static int prolong_escalar_batch(void *block_tag, + double **scalar_host_key, + void *buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *scalar_soa, + bool device_buffer) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (!buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int bank = active_or_keyed_escalar_bank(ctx, scalar_host_key, + (size_t)ex[0] * ex[1] * ex[2], + false); + if (bank < 0 || !ctx.escalar_valid[bank]) return 1; + const int region_all = sx * sy * sz; + double *d_comm = device_buffer ? (double *)buffer : + ensure_step_comm_buffer(ctx, (size_t)ESCALAR_FIELD_COUNT * region_all); + upload_comm_state_soa(scalar_soa, ESCALAR_FIELD_COUNT); + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)ESCALAR_FIELD_COUNT); + kern_prolong_state_region_batch<<>>( + ctx.d_escalar_resident_mem[bank], d_comm, + ex[0], ex[1], sx, sy, sz, + ii0, jj0, kk0, lbc_i, lbc_j, lbc_k, + region_all, ESCALAR_FIELD_COUNT, + ex[0] * ex[1] * ex[2]); + if (!device_buffer) { + CUDA_CHECK(cudaMemcpy(buffer, d_comm, + (size_t)ESCALAR_FIELD_COUNT * region_all * sizeof(double), + cudaMemcpyDeviceToHost)); + } + return 0; +} + +extern "C" +int bssn_cuda_escalar_has_resident_fields(void *block_tag, + double *Sphi_host, + double *Spi_host) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + auto it = g_step_ctx.find(block_tag); + if (it == g_step_ctx.end()) return 0; + double *key[ESCALAR_FIELD_COUNT] = { Sphi_host, Spi_host }; + const int bank = find_escalar_bank(it->second, key); + return (bank >= 0 && it->second.escalar_valid[bank]) ? 1 : 0; +} + +extern "C" +int bssn_cuda_escalar_has_any_resident_fields(void *block_tag) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + auto it = g_step_ctx.find(block_tag); + if (it == g_step_ctx.end()) return 0; + StepContext &ctx = it->second; + if (ctx.current_escalar_bank >= 0 && + ctx.current_escalar_bank < ESCALAR_RESIDENT_BANK_COUNT && + ctx.escalar_valid[ctx.current_escalar_bank]) + return 1; + for (int b = 0; b < ESCALAR_RESIDENT_BANK_COUNT; ++b) { + if (ctx.escalar_valid[b]) + return 1; + } + return 0; +} + +extern "C" +int bssn_cuda_escalar_download_fields_if_present(void *block_tag, + int *ex, + double *Sphi_host, + double *Spi_host) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + auto it = g_step_ctx.find(block_tag); + if (it == g_step_ctx.end()) return 0; + double *key[ESCALAR_FIELD_COUNT] = { Sphi_host, Spi_host }; + StepContext &ctx = it->second; + const int bank = find_escalar_bank(ctx, key); + if (bank < 0 || !ctx.escalar_valid[bank]) return 0; + if (ctx.escalar_host_clean[bank]) return 0; + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const size_t bytes = all * sizeof(double); + CUDA_CHECK(cudaMemcpyAsync(Sphi_host, ctx.d_escalar_resident[bank][0], + bytes, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(Spi_host, ctx.d_escalar_resident[bank][1], + bytes, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaDeviceSynchronize()); + ctx.escalar_host_clean[bank] = true; + return 0; +} + +extern "C" +int bssn_cuda_pack_escalar_batch_to_host_buffer(void *block_tag, + double **scalar_host_key, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + return copy_escalar_batch_host(block_tag, scalar_host_key, host_buffer, ex, + i0, j0, k0, sx, sy, sz, cudaMemcpyDeviceToHost); +} + +extern "C" +int bssn_cuda_unpack_escalar_batch_from_host_buffer(void *block_tag, + double **scalar_host_key, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + return copy_escalar_batch_host(block_tag, scalar_host_key, host_buffer, ex, + i0, j0, k0, sx, sy, sz, cudaMemcpyHostToDevice); +} + +extern "C" +int bssn_cuda_pack_escalar_batch_to_device_buffer(void *block_tag, + double **scalar_host_key, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + return copy_escalar_batch_device(block_tag, scalar_host_key, device_buffer, ex, + i0, j0, k0, sx, sy, sz, true); +} + +extern "C" +int bssn_cuda_unpack_escalar_batch_from_device_buffer(void *block_tag, + double **scalar_host_key, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + return copy_escalar_batch_device(block_tag, scalar_host_key, device_buffer, ex, + i0, j0, k0, sx, sy, sz, false); +} + +extern "C" +int bssn_cuda_restrict_escalar_batch_to_host_buffer(void *block_tag, + double **scalar_host_key, + double *host_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *scalar_soa) +{ + return restrict_escalar_batch(block_tag, scalar_host_key, host_buffer, ex, + sx, sy, sz, fi0, fj0, fk0, scalar_soa, false); +} + +extern "C" +int bssn_cuda_prolong_escalar_batch_to_host_buffer(void *block_tag, + double **scalar_host_key, + double *host_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *scalar_soa) +{ + return prolong_escalar_batch(block_tag, scalar_host_key, host_buffer, ex, + sx, sy, sz, ii0, jj0, kk0, + lbc_i, lbc_j, lbc_k, scalar_soa, false); +} + +extern "C" +int bssn_cuda_restrict_escalar_batch_to_device_buffer(void *block_tag, + double **scalar_host_key, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *scalar_soa) +{ + return restrict_escalar_batch(block_tag, scalar_host_key, device_buffer, ex, + sx, sy, sz, fi0, fj0, fk0, scalar_soa, true); +} + +extern "C" +int bssn_cuda_prolong_escalar_batch_to_device_buffer(void *block_tag, + double **scalar_host_key, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *scalar_soa) +{ + return prolong_escalar_batch(block_tag, scalar_host_key, device_buffer, ex, + sx, sy, sz, ii0, jj0, kk0, + lbc_i, lbc_j, lbc_k, scalar_soa, true); +} + extern "C" int bssn_cuda_download_state_subset(void *block_tag, int *ex, @@ -8277,6 +8900,69 @@ int bssn_cuda_upload_state_subset(void *block_tag, return 0; } +extern "C" +int bssn_cuda_prepare_escalar_inter_time_level(void *block_tag, + int *ex, + double **src1_host_key, + double **src2_host_key, + double **src3_host_key, + double **dst_host_key, + int source_count, + int tindex) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (source_count != 2 && source_count != 3) return 1; + if (!escalar_key_usable(src1_host_key) || + !escalar_key_usable(src2_host_key) || + !escalar_key_usable(dst_host_key)) + return 1; + if (source_count == 3 && !escalar_key_usable(src3_host_key)) + return 1; + + double c1 = 0.0, c2 = 0.0, c3 = 0.0; + if (source_count == 2) { + if (tindex == 0) { + c1 = 0.5; c2 = 0.5; + } else if (tindex == 1) { + c1 = 0.75; c2 = 0.25; + } else if (tindex == -1) { + c1 = 0.25; c2 = 0.75; + } else { + return 1; + } + } else { + if (tindex == 0) { + c1 = 3.0 / 8.0; c2 = 3.0 / 4.0; c3 = -1.0 / 8.0; + } else if (tindex == 1 || tindex == -1) { + c1 = 5.0 / 32.0; c2 = 15.0 / 16.0; c3 = -3.0 / 32.0; + } else { + return 1; + } + } + + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + StepContext &ctx = ensure_step_ctx(block_tag, all); + const int src1_bank = ensure_escalar_bank(ctx, src1_host_key, all, true); + const int src2_bank = ensure_escalar_bank(ctx, src2_host_key, all, true, src1_bank); + const int src3_bank = (source_count == 3) + ? ensure_escalar_bank(ctx, src3_host_key, all, true, src1_bank) + : -1; + const int dst_bank = reserve_escalar_output_bank(ctx, dst_host_key, all, src1_bank); + if (src1_bank < 0 || src2_bank < 0 || (source_count == 3 && src3_bank < 0) || dst_bank < 0) + return 1; + + dim3 launch_grid((unsigned int)grid(all), (unsigned int)ESCALAR_FIELD_COUNT); + kern_prepare_inter_time_level<<>>( + ctx.d_escalar_resident_mem[src1_bank], + ctx.d_escalar_resident_mem[src2_bank], + (source_count == 3) ? ctx.d_escalar_resident_mem[src3_bank] : nullptr, + ctx.d_escalar_resident_mem[dst_bank], + c1, c2, c3, ESCALAR_FIELD_COUNT, (int)all); + mark_escalar_output_valid(ctx, dst_bank); + return 0; +} + extern "C" int bssn_cuda_prepare_inter_time_level(void *block_tag, int *ex, diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 7ac103b..2a5e892 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -84,6 +84,88 @@ int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag, double &eps, int &precor); +int bssn_cuda_escalar_has_resident_fields(void *block_tag, + double *Sphi_host, + double *Spi_host); + +int bssn_cuda_escalar_has_any_resident_fields(void *block_tag); + +int bssn_cuda_escalar_download_fields_if_present(void *block_tag, + int *ex, + double *Sphi_host, + double *Spi_host); + +int bssn_cuda_pack_escalar_batch_to_host_buffer(void *block_tag, + double **scalar_host_key, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int bssn_cuda_unpack_escalar_batch_from_host_buffer(void *block_tag, + double **scalar_host_key, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int bssn_cuda_pack_escalar_batch_to_device_buffer(void *block_tag, + double **scalar_host_key, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int bssn_cuda_unpack_escalar_batch_from_device_buffer(void *block_tag, + double **scalar_host_key, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int bssn_cuda_restrict_escalar_batch_to_host_buffer(void *block_tag, + double **scalar_host_key, + double *host_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *scalar_soa); + +int bssn_cuda_prolong_escalar_batch_to_host_buffer(void *block_tag, + double **scalar_host_key, + double *host_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *scalar_soa); + +int bssn_cuda_restrict_escalar_batch_to_device_buffer(void *block_tag, + double **scalar_host_key, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *scalar_soa); + +int bssn_cuda_prolong_escalar_batch_to_device_buffer(void *block_tag, + double **scalar_host_key, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *scalar_soa); + +int bssn_cuda_prepare_escalar_inter_time_level(void *block_tag, + int *ex, + double **src1_host_key, + double **src2_host_key, + double **src3_host_key, + double **dst_host_key, + int source_count, + int tindex); + int bssn_cuda_copy_state_region_to_host(void *block_tag, int state_index, double *host_state, @@ -203,6 +285,14 @@ int bssn_cuda_restrict_state_batch_to_host_buffer_for_host_views(void *block_tag int fi0, int fj0, int fk0, const double *state_soa); +int bssn_cuda_restrict_state_batch_to_host_buffer(void *block_tag, + int state_count, + double *host_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *state_soa); + int bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views(void *block_tag, double **state_host_key, int state_count, @@ -213,6 +303,15 @@ int bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views(void *block_tag, int lbc_i, int lbc_j, int lbc_k, const double *state_soa); +int bssn_cuda_prolong_state_batch_to_host_buffer(void *block_tag, + int state_count, + double *host_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *state_soa); + int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag, int state_count, double *device_buffer,