From 4484635f0d9197547744c073e940cd1727d9b4ca Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 9 Apr 2026 18:49:11 +0800 Subject: [PATCH] Pack sync send buffers directly from GPU state --- AMSS_NCKU_source/Parallel.C | 138 ++++++++++++++++++++++++------ AMSS_NCKU_source/bssn_cuda_step.C | 25 +++++- AMSS_NCKU_source/bssn_gpu.cu | 68 +++++++++++++++ AMSS_NCKU_source/bssn_gpu.h | 7 ++ 4 files changed, 210 insertions(+), 28 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 3ecb14b..5ebb1b2 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -3,6 +3,7 @@ #include "fmisc.h" #include "prolongrestrict.h" #include "bssn_cuda_ops.h" +#include "bssn_gpu.h" #include "misc.h" #include "parameters.h" #include @@ -43,6 +44,85 @@ struct ParallelTransferContextGuard } }; +bool parallel_can_gpu_pack_segments(MyList *src, MyList *dst, + int rank_in, MyList *VarLists, MyList *VarListd) +{ + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (!src || !dst) + return false; + + if (src->data->Bg->lev != dst->data->Bg->lev) + return false; + + while (src && dst) + { + if ((dst->data->Bg->rank == rank_in) && (src->data->Bg->rank == myrank)) + { + MyList *varls = VarLists; + MyList *varld = VarListd; + while (varls && varld) + { + (void)varld; + if (!bssn_gpu_find_device_buffer(src->data->Bg->fgfs[varls->data->sgfn])) + return false; + varls = varls->next; + varld = varld->next; + } + if (varls || varld) + return false; + } + src = src->next; + dst = dst->next; + } + return true; +} + +bool parallel_gpu_pack_segments(double *data, + MyList *src, MyList *dst, + int rank_in, MyList *VarLists, MyList *VarListd) +{ + if (!data) + return false; + + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (!src || !dst) + return false; + if (src->data->Bg->lev != dst->data->Bg->lev) + return false; + + int size_out = 0; + while (src && dst) + { + if ((dst->data->Bg->rank == rank_in) && (src->data->Bg->rank == myrank)) + { + MyList *varls = VarLists; + MyList *varld = VarListd; + while (varls && varld) + { + (void)varld; + if (bssn_gpu_stage_download_region_to_buffer(src->data->Bg->fgfs[varls->data->sgfn], + src->data->Bg->shape, + src->data->Bg->bbox, + src->data->Bg->bbox + dim, + dst->data->shape, + dst->data->llb, + data + size_out)) + return false; + size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2]; + varls = varls->next; + varld = varld->next; + } + if (varls || varld) + return false; + } + src = src->next; + dst = dst->next; + } + return true; +} + void parallel_report_mpi_error(const char *context, int errcode, int req_no) { char errstr[MPI_MAX_ERROR_STRING]; @@ -4843,27 +4923,30 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr for (int node = 0; node < cpusize; node++) { - if (node == myrank) - { - int length; + if (node == myrank) + { + int length; if (!cache.lengths_valid) { length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); cache.recv_lengths[node] = length; } else { length = cache.recv_lengths[node]; } - if (length > 0) - { - if (length > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[length]; - cache.recv_buf_caps[node] = length; - } - data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); - } - } - else + if (length > 0) + { + if (length > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[length]; + cache.recv_buf_caps[node] = length; + } + bssn_gpu_prepare_host_buffer(cache.recv_bufs[node], length); + if (!parallel_can_gpu_pack_segments(src[myrank], dst[myrank], node, VarList, VarList) || + !parallel_gpu_pack_segments(cache.recv_bufs[node], src[myrank], dst[myrank], node, VarList, VarList)) + data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + } + } + else { int slength; if (!cache.lengths_valid) { @@ -4872,17 +4955,20 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr } else { slength = cache.send_lengths[node]; } - if (slength > 0) - { - if (slength > cache.send_buf_caps[node]) - { - if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; - cache.send_bufs[node] = new double[slength]; - cache.send_buf_caps[node] = slength; - } - data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); - state.req_node[state.req_no] = node; - state.req_is_recv[state.req_no] = 0; + if (slength > 0) + { + if (slength > cache.send_buf_caps[node]) + { + if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; + cache.send_bufs[node] = new double[slength]; + cache.send_buf_caps[node] = slength; + } + bssn_gpu_prepare_host_buffer(cache.send_bufs[node], slength); + if (!parallel_can_gpu_pack_segments(src[myrank], dst[myrank], node, VarList, VarList) || + !parallel_gpu_pack_segments(cache.send_bufs[node], src[myrank], dst[myrank], node, VarList, VarList)) + data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + state.req_node[state.req_no] = node; + state.req_is_recv[state.req_no] = 0; MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, state.mpi_tag, MPI_COMM_WORLD, cache.reqs + state.req_no++); } int rlength; diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C index e4a759a..4be8d1e 100644 --- a/AMSS_NCKU_source/bssn_cuda_step.C +++ b/AMSS_NCKU_source/bssn_cuda_step.C @@ -306,6 +306,26 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) return true; }; + auto can_pack_sync_from_device = + [&](MyList *var_list, Parallel::SyncCache &cache) -> bool { + if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize) + return false; + + MyList *seg = cache.combined_src[myrank]; + while (seg) + { + MyList *var_it = var_list; + while (var_it) + { + if (!bssn_gpu_find_device_buffer(seg->data->Bg->fgfs[var_it->data->sgfn])) + return false; + var_it = var_it->next; + } + seg = seg->next; + } + return true; + }; + MyList *Pp = GH->PatL[lev]; while (Pp) { @@ -341,7 +361,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Pp = Pp->next; } - if (!ERROR && sync_cache_pre[lev].valid) + if (!ERROR && sync_cache_pre[lev].valid && !can_pack_sync_from_device(SynchList_pre, sync_cache_pre[lev])) refresh_stage_host_before_sync(SynchList_pre, sync_cache_pre[lev]); MPI_Request err_req_pre; @@ -433,7 +453,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Pp = Pp->next; } - if (!ERROR && sync_cache_cor[lev].valid && iter_count < 3) + if (!ERROR && sync_cache_cor[lev].valid && iter_count < 3 && + !can_pack_sync_from_device(SynchList_cor, sync_cache_cor[lev])) refresh_stage_host_before_sync(SynchList_cor, sync_cache_cor[lev]); MPI_Request err_req_cor; diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index efcef1f..730bdec 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -1099,6 +1099,74 @@ int bssn_gpu_stage_download_region(double *host_ptr, return 0; } +int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr, + const int *full_shape, + const double *full_llb, + const double *full_uub, + const int *region_shape, + const double *region_llb, + double *host_dst_ptr) +{ + if (!host_src_ptr || !host_dst_ptr || !full_shape || !full_llb || !full_uub || !region_shape || !region_llb) + return 1; + + const double *device_ptr = bssn_gpu_find_device_buffer(host_src_ptr); + if (!device_ptr) + return 1; + + int start[3] = {0, 0, 0}; + for (int i = 0; i < 3; ++i) + { + if (full_shape[i] <= 0 || region_shape[i] <= 0) + return 1; + +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + const double dx = (full_uub[i] - full_llb[i]) / static_cast(full_shape[i] - 1); + start[i] = static_cast((region_llb[i] - full_llb[i]) / dx + 0.4); +#else +#ifdef Cell + const double dx = (full_uub[i] - full_llb[i]) / static_cast(full_shape[i]); + start[i] = static_cast((region_llb[i] - full_llb[i]) / dx + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + + if (start[i] < 0 || start[i] + region_shape[i] > full_shape[i]) + return 1; + } + + cudaMemcpy3DParms parms = {}; + parms.srcPtr = make_cudaPitchedPtr(const_cast(device_ptr), + static_cast(full_shape[0]) * sizeof(double), + static_cast(full_shape[0]), + static_cast(full_shape[1])); + parms.dstPtr = make_cudaPitchedPtr(host_dst_ptr, + static_cast(region_shape[0]) * sizeof(double), + static_cast(region_shape[0]), + static_cast(region_shape[1])); + parms.srcPos = make_cudaPos(static_cast(start[0]) * sizeof(double), + static_cast(start[1]), + static_cast(start[2])); + parms.dstPos = make_cudaPos(0, 0, 0); + parms.extent = make_cudaExtent(static_cast(region_shape[0]) * sizeof(double), + static_cast(region_shape[1]), + static_cast(region_shape[2])); + parms.kind = cudaMemcpyDeviceToHost; + + cudaError_t err = cudaMemcpy3D(&parms); + if (err != cudaSuccess) + { + cerr << "cudaMemcpy3D(D2H region->buffer) failed: " << cudaGetErrorString(err) << endl; + return 1; + } + + return 0; +} + __global__ void test_const_address(double * testd){ int _t = blockIdx.x*blockDim.x+threadIdx.x; if(_t == 0) diff --git a/AMSS_NCKU_source/bssn_gpu.h b/AMSS_NCKU_source/bssn_gpu.h index d4561e8..1e89ca0 100644 --- a/AMSS_NCKU_source/bssn_gpu.h +++ b/AMSS_NCKU_source/bssn_gpu.h @@ -84,6 +84,13 @@ int bssn_gpu_stage_download_region(double *host_ptr, const double *full_uub, const int *region_shape, const double *region_llb); +int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr, + const int *full_shape, + const double *full_llb, + const double *full_uub, + const int *region_shape, + const double *region_llb, + double *host_dst_ptr); /** Init GPU side data in GPUMeta. */ // void init_fluid_meta_gpu(GPUMeta *gpu_meta);