From 090d8657aec8d2f251be9053b80d8a9eb53a55ec Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Wed, 29 Apr 2026 18:34:31 +0800 Subject: [PATCH] Optimize BSSN CUDA state transfers --- AMSS_NCKU_source/Parallel.C | 125 ++++++++++++++++++ AMSS_NCKU_source/bssn_rhs_cuda.cu | 202 ++++++++++++++++++++++++++++-- AMSS_NCKU_source/bssn_rhs_cuda.h | 14 +++ 3 files changed, 332 insertions(+), 9 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 6760047..898f7aa 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -6,6 +6,7 @@ #include "parameters.h" #include #include +#include #ifndef USE_CUDA_Z4C #define USE_CUDA_Z4C 0 @@ -391,6 +392,113 @@ bool cuda_device_state_count_supported(int state_count) #endif } +#if USE_CUDA_BSSN +bool cuda_flush_device_segment_batch(Block *block, + double *data, + int state_count, + const std::vector &meta, + int dir) +{ + if (!block || meta.empty()) + return true; + const int segment_count = (int)(meta.size() / 8); + if (dir == PACK) + return bssn_cuda_pack_state_segments_to_device_buffer( + block, state_count, data, block->shape, segment_count, meta.data()) == 0; + return bssn_cuda_unpack_state_segments_from_device_buffer( + block, state_count, data, block->shape, segment_count, meta.data()) == 0; +} + +int cuda_data_packer_device_batched(double *data, + MyList *src, + MyList *dst, + int rank_in, + int dir, + MyList *VarLists, + MyList *VarListd, + int Symmetry) +{ + (void)Symmetry; + if (!data || (dir != PACK && dir != UNPACK) || !src || !dst) + return -1; + + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + const int state_count = cuda_state_var_count(VarLists, VarListd); + if (!cuda_device_state_count_supported(state_count)) + return -1; + + int size_out = 0; + Block *batch_block = 0; + std::vector batch_meta; + batch_meta.reserve(64); + + while (src && dst) + { + const bool active = + (dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || + (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank); + if (active) + { + int type; + if (src->data->Bg->lev == dst->data->Bg->lev) + type = 1; + else if (src->data->Bg->lev > dst->data->Bg->lev) + type = 2; + else + type = 3; + if (type != 1) + return -1; + + Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg; + if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type)) || + (dir == UNPACK && !cuda_can_direct_unpack(dst->data, type))) + return -1; + + if (batch_block && batch_block != block) + { + if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir)) + return -1; + batch_meta.clear(); + } + batch_block = block; + + const int i0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 0) + : cuda_seg_begin(dst->data, block, 0); + const int j0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 1) + : cuda_seg_begin(dst->data, block, 1); + const int k0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 2) + : cuda_seg_begin(dst->data, block, 2); + const int sx = dst->data->shape[0]; + const int sy = dst->data->shape[1]; + const int sz = dst->data->shape[2]; + const int region_all = sx * sy * sz; + + batch_meta.push_back(i0); + batch_meta.push_back(j0); + batch_meta.push_back(k0); + batch_meta.push_back(sx); + batch_meta.push_back(sy); + batch_meta.push_back(sz); + batch_meta.push_back(region_all); + batch_meta.push_back(size_out); + + size_out += state_count * region_all; + } + src = src->next; + dst = dst->next; + } + + if (batch_block) + { + if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir)) + return -1; + } + return size_out; +} +#endif + bool cuda_segments_same_level(MyList *src, MyList *dst, int rank_in, @@ -465,6 +573,23 @@ int data_packer_with_device_buffer(double *data, MyList *VarListd, int Symmetry) { +#if USE_CUDA_BSSN + const double batched_t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + const int batched = cuda_data_packer_device_batched(data, src, dst, rank_in, dir, + VarLists, VarListd, Symmetry); + if (batched >= 0) + { + if (sync_profile_enabled()) + { + const double dt = MPI_Wtime() - batched_t0; + if (dir == PACK) + sync_profile_stats().direct_pack_sec += dt; + else if (dir == UNPACK) + sync_profile_stats().direct_unpack_sec += dt; + } + return batched; + } +#endif s_cuda_aware_pack_active = true; int n = Parallel::data_packer(data, src, dst, rank_in, dir, VarLists, VarListd, Symmetry); s_cuda_aware_pack_active = false; diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index c818792..f84be43 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -74,6 +74,12 @@ struct CudaProfileStats { double bc_ms; double finalize_ms; double output_ms; + long long upload_calls; + long long resident_download_calls; + double upload_ms; + double resident_download_ms; + double upload_gb; + double resident_download_gb; }; enum RhsStageId { @@ -97,7 +103,10 @@ struct RhsStageProfileStats { }; static CudaProfileStats &cuda_profile_stats() { - static CudaProfileStats stats = {0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + static CudaProfileStats stats = { + 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0, 0, 0.0, 0.0, 0.0, 0.0 + }; return stats; } @@ -162,7 +171,8 @@ static void cuda_profile_maybe_log() { CudaProfileStats &stats = cuda_profile_stats(); if (stats.calls <= 0 || stats.calls % cuda_profile_every() != 0) return; fprintf(stderr, - "[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms\n", + "[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms" + " uploads=%lld avg_upload=%.3f ms upload_GB=%.3f resident_downloads=%lld avg_resident_download=%.3f ms resident_download_GB=%.3f\n", g_dispatch.my_rank, g_dispatch.my_device, stats.calls, stats.total_ms / (double)stats.calls, stats.state_ms / (double)stats.calls, @@ -170,7 +180,13 @@ static void cuda_profile_maybe_log() { stats.rhs_ms / (double)stats.calls, stats.bc_ms / (double)stats.calls, stats.finalize_ms / (double)stats.calls, - stats.output_ms / (double)stats.calls); + stats.output_ms / (double)stats.calls, + stats.upload_calls, + stats.upload_calls ? stats.upload_ms / (double)stats.upload_calls : 0.0, + stats.upload_gb, + stats.resident_download_calls, + stats.resident_download_calls ? stats.resident_download_ms / (double)stats.resident_download_calls : 0.0, + stats.resident_download_gb); fflush(stderr); } @@ -542,6 +558,8 @@ struct StepAllocation { static std::unordered_map g_step_ctx; static std::vector g_step_pool; +static int *g_comm_segment_meta = nullptr; +static size_t g_comm_segment_meta_cap = 0; static StepAllocation empty_step_allocation() { @@ -760,6 +778,20 @@ static double *ensure_step_host_comm_buffer(StepContext &ctx, size_t needed_doub return ctx.h_comm_mem; } +static int *ensure_comm_segment_meta_buffer(size_t needed_ints) +{ + if (needed_ints == 0) return nullptr; + if (g_comm_segment_meta_cap < needed_ints) { + if (g_comm_segment_meta) { + CUDA_CHECK(cudaFree(g_comm_segment_meta)); + g_comm_segment_meta = nullptr; + } + CUDA_CHECK(cudaMalloc(&g_comm_segment_meta, needed_ints * sizeof(int))); + g_comm_segment_meta_cap = needed_ints; + } + return g_comm_segment_meta; +} + static void upload_grid_params_if_needed(const GridParams &gp) { if (!g_gp_host_cache_valid || @@ -4716,18 +4748,25 @@ static void compute_patch_boundary_flags(int *ex, static void upload_state_inputs(double **state_host, size_t all) { const size_t bytes = all * sizeof(double); + const bool profile = cuda_profile_enabled(); + const double t0 = profile ? cuda_profile_now_ms() : 0.0; static int direct_upload = -1; if (direct_upload < 0) { const char *env = getenv("AMSS_CUDA_DIRECT_STATE_UPLOAD"); - const char *pin_env = getenv("AMSS_CUDA_PIN_GRIDFUNCS"); - direct_upload = env ? ((atoi(env) != 0) ? 1 : 0) - : ((pin_env && atoi(pin_env) != 0) ? 1 : 0); + direct_upload = env ? ((atoi(env) != 0) ? 1 : 0) : 1; } if (direct_upload) { for (int i = 0; i < BSSN_STATE_COUNT; ++i) { CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_state_input_slots[i]], state_host[i], bytes, cudaMemcpyHostToDevice)); } + if (profile) { + cuda_profile_sync(); + CudaProfileStats &stats = cuda_profile_stats(); + stats.upload_calls++; + stats.upload_ms += cuda_profile_now_ms() - t0; + stats.upload_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9; + } return; } for (int i = 0; i < BSSN_STATE_COUNT; ++i) { @@ -4736,6 +4775,12 @@ static void upload_state_inputs(double **state_host, size_t all) CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage, (size_t)BSSN_STATE_COUNT * bytes, cudaMemcpyHostToDevice)); + if (profile) { + CudaProfileStats &stats = cuda_profile_stats(); + stats.upload_calls++; + stats.upload_ms += cuda_profile_now_ms() - t0; + stats.upload_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9; + } } static void upload_matter_cache(StepContext &ctx, @@ -5168,6 +5213,62 @@ __global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem, } } +__global__ void kern_pack_state_segments_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + const int * __restrict__ meta, + int state_count, + int all) +{ + const int segment = blockIdx.z; + const int state_index = blockIdx.y; + const int *m = meta + segment * 8; + const int i0 = m[0], j0 = m[1], k0 = m[2]; + const int sx = m[3], sy = m[4]; + const int region_all = m[6]; + const int offset = m[7]; + if (state_index >= state_count) return; + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny; + dst[(size_t)offset + (size_t)state_index * region_all + local] = + src_mem[(size_t)state_index * all + src]; + } +} + +__global__ void kern_unpack_state_segments_batch(double * __restrict__ dst_mem, + const double * __restrict__ src, + int nx, int ny, + const int * __restrict__ meta, + int state_count, + int all) +{ + const int segment = blockIdx.z; + const int state_index = blockIdx.y; + const int *m = meta + segment * 8; + const int i0 = m[0], j0 = m[1], k0 = m[2]; + const int sx = m[3], sy = m[4]; + const int region_all = m[6]; + const int offset = m[7]; + if (state_index >= state_count) return; + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny; + dst_mem[(size_t)state_index * all + dst] = + src[(size_t)offset + (size_t)state_index * region_all + local]; + } +} + __global__ void kern_pack_state_subset(const double * __restrict__ src_mem, double * __restrict__ dst, int subset_count, @@ -5308,12 +5409,12 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t bytes = all * sizeof(double); StepContext &ctx = ensure_step_ctx(block_tag, all); + const bool profile = cuda_profile_enabled(); + const double t0 = profile ? cuda_profile_now_ms() : 0.0; static int direct_download = -1; if (direct_download < 0) { const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD"); - const char *pin_env = getenv("AMSS_CUDA_PIN_GRIDFUNCS"); - direct_download = env ? ((atoi(env) != 0) ? 1 : 0) - : ((pin_env && atoi(pin_env) != 0) ? 1 : 0); + direct_download = env ? ((atoi(env) != 0) ? 1 : 0) : 1; } if (direct_download) { for (int i = 0; i < BSSN_STATE_COUNT; ++i) { @@ -5321,6 +5422,12 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos bytes, cudaMemcpyDeviceToHost)); } CUDA_CHECK(cudaDeviceSynchronize()); + if (profile) { + CudaProfileStats &stats = cuda_profile_stats(); + stats.resident_download_calls++; + stats.resident_download_ms += cuda_profile_now_ms() - t0; + stats.resident_download_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9; + } return; } CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem, @@ -5329,6 +5436,12 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos for (int i = 0; i < BSSN_STATE_COUNT; ++i) { std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); } + if (profile) { + CudaProfileStats &stats = cuda_profile_stats(); + stats.resident_download_calls++; + stats.resident_download_ms += cuda_profile_now_ms() - t0; + stats.resident_download_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9; + } } static void copy_state_subset(void *block_tag, @@ -6060,6 +6173,47 @@ static void copy_state_device_batch(void *block_tag, } } +static void copy_state_device_segments(void *block_tag, + int state_count, + double *device_buffer, + const int *ex, + int segment_count, + const int *segment_meta, + int pack_not_unpack) +{ + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (segment_count <= 0 || !segment_meta) return; + + int max_region_all = 0; + for (int s = 0; s < segment_count; ++s) { + const int *m = segment_meta + s * 8; + if (m[3] <= 0 || m[4] <= 0 || m[5] <= 0 || m[6] <= 0) return; + if (m[6] > max_region_all) max_region_all = m[6]; + } + if (max_region_all <= 0) return; + + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8); + CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, + (size_t)segment_count * 8 * sizeof(int), + cudaMemcpyHostToDevice)); + + dim3 launch_grid((unsigned int)grid((size_t)max_region_all), + (unsigned int)state_count, + (unsigned int)segment_count); + if (pack_not_unpack) { + kern_pack_state_segments_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], d_meta, state_count, + ex[0] * ex[1] * ex[2]); + } else { + kern_unpack_state_segments_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], d_meta, state_count, + ex[0] * ex[1] * ex[2]); + } +} + extern "C" int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag, int state_count, @@ -6090,6 +6244,36 @@ int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, 1); + return 0; +} + +extern "C" +int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, 0); + return 0; +} + extern "C" int bssn_cuda_download_state_subset(void *block_tag, int *ex, diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 12e190a..37b6ab4 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -118,6 +118,20 @@ int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta); + +int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta); + int bssn_cuda_download_state_subset(void *block_tag, int *ex, int subset_count,