From 1064a68d1680fa13ff7790ae16c8716dd99b65ba Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 7 May 2026 21:38:16 +0800 Subject: [PATCH] Optimize BSSN-EM 8th-order AMR transfers --- AMSS_NCKU_source/Parallel.C | 235 ++++++++++++++++++++++++++++++++++++ makefile_and_run.py | 6 + 2 files changed, 241 insertions(+) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 2d550fa..332f9b5 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -1426,6 +1426,231 @@ int cuda_data_packer_device_batched(double *data, } return size_out; } + +bool cuda_ensure_bssn_block_resident_for_pack(Block *block, + MyList *vars, + int state_count, + std::vector &uploaded) +{ + if (!block) + return false; + if (bssn_cuda_has_resident_state(block) != 0) + return true; + for (size_t i = 0; i < uploaded.size(); ++i) + { + if (uploaded[i] == block) + return bssn_cuda_has_resident_state(block) != 0; + } + + double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; + if (!cuda_build_bssn_host_views(block, vars, state_count, views)) + return false; + if (bssn_cuda_upload_resident_state_count(block, block->shape, views, state_count) != 0) + return false; + uploaded.push_back(block); + return bssn_cuda_has_resident_state(block) != 0; +} + +void cuda_host_batch_diag(const char *reason, int state_count, int type) +{ + static int reported = 0; + const char *env = getenv("AMSS_CUDA_HOST_BATCH_DIAG"); + if (!env || atoi(env) == 0 || reported >= 32) + return; + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + fprintf(stderr, + "[AMSS-CUDA-HOST-BATCH][rank %d] disabled reason=%s state_count=%d type=%d\n", + rank, reason ? reason : "unknown", state_count, type); + fflush(stderr); + reported++; +} + +int cuda_transfer_active_length_if_batched_eligible(MyList *src, + MyList *dst, + int rank_in, + int dir, + MyList *VarLists, + int state_count, + int myrank, + std::vector &uploaded) +{ + if (dir != PACK && dir != UNPACK) + { + cuda_host_batch_diag("bad_dir", state_count, -1); + return -1; + } + if (!cuda_device_segment_batch_enabled()) + { + cuda_host_batch_diag("segment_batch_off", state_count, -1); + return -1; + } + if (!cuda_device_state_count_supported(state_count)) + { + cuda_host_batch_diag("unsupported_state_count", state_count, -1); + return -1; + } + if (cuda_amr_restrict_compare_enabled()) + { + cuda_host_batch_diag("compare_enabled", state_count, -1); + return -1; + } + + int total = 0; + bool has_work = false; + bool has_amr = false; + while (src && dst) + { + const bool active = + (dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || + (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank); + if (active) + { + has_work = true; + if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg) + { + cuda_host_batch_diag("null_segment", state_count, -1); + return -1; + } + int type; + if (src->data->Bg->lev == dst->data->Bg->lev) + type = 1; + else if (src->data->Bg->lev > dst->data->Bg->lev) + type = 2; + else + type = 3; + + if (type == 2 || type == 3) + has_amr = true; + if (dir == PACK && type == 2 && !cuda_amr_restrict_batch_enabled()) + { + cuda_host_batch_diag("restrict_batch_off", state_count, type); + return -1; + } + if (dir == PACK) + { + if ((type == 2 || type == 3) && + !cuda_ensure_bssn_block_resident_for_pack(src->data->Bg, VarLists, + state_count, uploaded)) + { + cuda_host_batch_diag("resident_upload_failed", state_count, type); + return -1; + } + if (!cuda_can_direct_pack(src->data, dst->data, type)) + { + cuda_host_batch_diag("direct_pack_ineligible", state_count, type); + return -1; + } + } + else + { + if (!cuda_can_direct_unpack(dst->data, type)) + { + cuda_host_batch_diag("direct_unpack_ineligible", state_count, type); + return -1; + } + } + + total += state_count * dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2]; + } + src = src->next; + dst = dst->next; + } + if (!has_work) + return 0; + if (!has_amr) + { + cuda_host_batch_diag("no_amr_segment", state_count, -1); + return -1; + } + return total; +} + +int cuda_data_packer_host_staged_batched(double *host_data, + MyList *src, + MyList *dst, + int rank_in, + int dir, + MyList *VarLists, + MyList *VarListd, + int Symmetry) +{ + if (!host_data || !cuda_amr_host_staged_enabled()) + { + cuda_host_batch_diag(!host_data ? "null_host_data" : "host_staged_off", -1, -1); + return -1; + } + + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + const int state_count = cuda_state_var_count(VarLists, VarListd); + if (state_count < 0) + { + cuda_host_batch_diag("var_list_mismatch", state_count, -1); + return -1; + } + + std::vector uploaded; + const int total = cuda_transfer_active_length_if_batched_eligible(src, dst, rank_in, + dir, VarLists, + state_count, myrank, + uploaded); + if (total <= 0) + return total; + + static double *stage_dev = 0; + static int stage_cap = 0; + if (total > stage_cap) + { + free_device_comm_buffer(stage_dev); + stage_dev = alloc_device_comm_buffer(total); + stage_cap = total; + } + + if (dir == UNPACK) + { + cudaError_t h2d = cudaMemcpy(stage_dev, host_data, (size_t)total * sizeof(double), + cudaMemcpyHostToDevice); + if (h2d != cudaSuccess) + { + fprintf(stderr, "Parallel: CUDA host-staged batched unpack cudaMemcpy failed, err=%d\n", + (int)h2d); + return -1; + } + } + + const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + const int packed = cuda_data_packer_device_batched(stage_dev, src, dst, rank_in, dir, + VarLists, VarListd, Symmetry); + if (packed != total) + { + cuda_host_batch_diag("device_batched_failed", state_count, -1); + return -1; + } + + if (dir == PACK) + { + cudaError_t d2h = cudaMemcpy(host_data, stage_dev, (size_t)total * sizeof(double), + cudaMemcpyDeviceToHost); + if (d2h != cudaSuccess) + { + fprintf(stderr, "Parallel: CUDA host-staged batched pack cudaMemcpy failed, err=%d\n", + (int)d2h); + return -1; + } + } + + if (sync_profile_enabled()) + { + const double dt = MPI_Wtime() - t0; + if (dir == PACK) + sync_profile_stats().direct_pack_sec += dt; + else + sync_profile_stats().direct_unpack_sec += dt; + } + return total; +} #endif bool cuda_segments_device_eligible(MyList *src, @@ -5347,6 +5572,16 @@ int Parallel::data_packer(double *data, MyList *src, MyList

= 0) + return batched; + } +#endif + int type; /* 1 copy, 2 restrict, 3 prolong */ if (src->data->Bg->lev == dst->data->Bg->lev) type = 1; diff --git a/makefile_and_run.py b/makefile_and_run.py index 2edd4ef..6402c3e 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -167,6 +167,12 @@ def _gpu_runtime_env(): "AMSS_INTERP_GPU": "0", "AMSS_CUDA_AWARE_MPI": "0", }) + if finite_difference == "8th-order" and getattr(input_data, "Equation_Class", "") == "BSSN-EM": + defaults.update({ + "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1", + "AMSS_CUDA_AMR_RESTRICT_BATCH": "1", + "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "1", + }) if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"): defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1" if getattr(input_data, "Equation_Class", "") == "Z4C":