diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 5ebb1b2..8140788 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -123,6 +123,85 @@ bool parallel_gpu_pack_segments(double *data, return true; } +bool parallel_can_gpu_unpack_segments(MyList *src, MyList *dst, + int rank_in, MyList *VarLists, MyList *VarListd) +{ + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (!src || !dst) + return false; + + if (src->data->Bg->lev != dst->data->Bg->lev) + return false; + + while (src && dst) + { + if ((src->data->Bg->rank == rank_in) && (dst->data->Bg->rank == myrank)) + { + MyList *varls = VarLists; + MyList *varld = VarListd; + while (varls && varld) + { + (void)varls; + if (!bssn_gpu_find_device_buffer(dst->data->Bg->fgfs[varld->data->sgfn])) + return false; + varls = varls->next; + varld = varld->next; + } + if (varls || varld) + return false; + } + src = src->next; + dst = dst->next; + } + return true; +} + +bool parallel_gpu_unpack_segments(const double *data, + MyList *src, MyList *dst, + int rank_in, MyList *VarLists, MyList *VarListd) +{ + if (!data) + return false; + + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (!src || !dst) + return false; + if (src->data->Bg->lev != dst->data->Bg->lev) + return false; + + int size_out = 0; + while (src && dst) + { + if ((src->data->Bg->rank == rank_in) && (dst->data->Bg->rank == myrank)) + { + MyList *varls = VarLists; + MyList *varld = VarListd; + while (varls && varld) + { + (void)varls; + if (bssn_gpu_stage_upload_buffer_to_region(data + size_out, + dst->data->Bg->fgfs[varld->data->sgfn], + dst->data->Bg->shape, + dst->data->Bg->bbox, + dst->data->Bg->bbox + dim, + dst->data->shape, + dst->data->llb)) + return false; + size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2]; + varls = varls->next; + varld = varld->next; + } + if (varls || varld) + return false; + } + src = src->next; + dst = dst->next; + } + return true; +} + void parallel_report_mpi_error(const char *context, int errcode, int req_no) { char errstr[MPI_MAX_ERROR_STRING]; @@ -4996,23 +5075,28 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr cache.lengths_valid = true; } // Sync_finish: progressive unpack as receives complete, then wait for sends -void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, - MyList *VarList, int Symmetry) -{ - if (!state.active) - return; +void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, + MyList *VarList, int Symmetry, bool unpack_to_host) +{ + if (!state.active) + return; int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MyList **src = cache.combined_src; MyList **dst = cache.combined_dst; - - // Unpack local data first (no MPI needed) - if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0) - data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry); - - // Progressive unpack of remote receives - if (state.pending_recv > 0 && state.req_no > 0) + + // Unpack local data first (no MPI needed) + if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0) + { + if (unpack_to_host || + !parallel_can_gpu_unpack_segments(src[myrank], dst[myrank], myrank, VarList, VarList) || + !parallel_gpu_unpack_segments(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, VarList, VarList)) + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry); + } + + // Progressive unpack of remote receives + if (state.pending_recv > 0 && state.req_no > 0) { int pending = state.pending_recv; int *completed = new int[cache.max_reqs]; @@ -5025,13 +5109,16 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, for (int i = 0; i < outcount; i++) { int idx = completed[i]; - if (idx >= 0 && state.req_is_recv[idx]) - { - int recv_node = state.req_node[idx]; - data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry); - pending--; - } - } + if (idx >= 0 && state.req_is_recv[idx]) + { + int recv_node = state.req_node[idx]; + if (unpack_to_host || + !parallel_can_gpu_unpack_segments(src[recv_node], dst[recv_node], recv_node, VarList, VarList) || + !parallel_gpu_unpack_segments(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, VarList, VarList)) + data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry); + pending--; + } + } } delete[] completed; } diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h index f115af5..e1e3784 100644 --- a/AMSS_NCKU_source/Parallel.h +++ b/AMSS_NCKU_source/Parallel.h @@ -134,10 +134,10 @@ namespace Parallel AsyncSyncState() : req_no(0), active(false), mpi_tag(0), req_node(0), req_is_recv(0), pending_recv(0) {} }; - void Sync_start(MyList *PatL, MyList *VarList, int Symmetry, - SyncCache &cache, AsyncSyncState &state); - void Sync_finish(SyncCache &cache, AsyncSyncState &state, - MyList *VarList, int Symmetry); + void Sync_start(MyList *PatL, MyList *VarList, int Symmetry, + SyncCache &cache, AsyncSyncState &state); + void Sync_finish(SyncCache &cache, AsyncSyncState &state, + MyList *VarList, int Symmetry, bool unpack_to_host = true); void OutBdLow2Hi(Patch *Patc, Patch *Patf, MyList *VarList1 /* source */, MyList *VarList2 /* target */, int Symmetry); diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C index 4be8d1e..51f488b 100644 --- a/AMSS_NCKU_source/bssn_cuda_step.C +++ b/AMSS_NCKU_source/bssn_cuda_step.C @@ -56,6 +56,11 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) const bool BB = fgt(PhysTime, StartTime, dT_lev / 2); (void)BB; +#if (MAPBH == 0) + const bool need_host_stage_sync = (BH_num > 0 && lev == GH->levels - 1); +#else + const bool need_host_stage_sync = false; +#endif double ndeps = (lev < GH->movls) ? numepsb : numepss; double TRK4 = PhysTime; int iter_count = 0; @@ -372,8 +377,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Parallel::AsyncSyncState async_pre; Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); - Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); - if (!ERROR) + Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, need_host_stage_sync); + if (!ERROR && need_host_stage_sync) refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]); MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE); @@ -465,8 +470,9 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Parallel::AsyncSyncState async_cor; Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); - Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); - if (!ERROR && iter_count < 3) + const bool unpack_cor_to_host = (iter_count == 3) || need_host_stage_sync; + Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, unpack_cor_to_host); + if (!ERROR && iter_count < 3 && unpack_cor_to_host) refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]); MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE); diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index 730bdec..72eccc5 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -1167,6 +1167,74 @@ int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr, return 0; } +int bssn_gpu_stage_upload_buffer_to_region(const double *host_src_ptr, + double *host_dst_ptr, + const int *full_shape, + const double *full_llb, + const double *full_uub, + const int *region_shape, + const double *region_llb) +{ + if (!host_src_ptr || !host_dst_ptr || !full_shape || !full_llb || !full_uub || !region_shape || !region_llb) + return 1; + + const double *device_ptr = bssn_gpu_find_device_buffer(host_dst_ptr); + if (!device_ptr) + return 1; + + int start[3] = {0, 0, 0}; + for (int i = 0; i < 3; ++i) + { + if (full_shape[i] <= 0 || region_shape[i] <= 0) + return 1; + +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + const double dx = (full_uub[i] - full_llb[i]) / static_cast(full_shape[i] - 1); + start[i] = static_cast((region_llb[i] - full_llb[i]) / dx + 0.4); +#else +#ifdef Cell + const double dx = (full_uub[i] - full_llb[i]) / static_cast(full_shape[i]); + start[i] = static_cast((region_llb[i] - full_llb[i]) / dx + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + + if (start[i] < 0 || start[i] + region_shape[i] > full_shape[i]) + return 1; + } + + cudaMemcpy3DParms parms = {}; + parms.srcPtr = make_cudaPitchedPtr(const_cast(host_src_ptr), + static_cast(region_shape[0]) * sizeof(double), + static_cast(region_shape[0]), + static_cast(region_shape[1])); + parms.dstPtr = make_cudaPitchedPtr(const_cast(device_ptr), + static_cast(full_shape[0]) * sizeof(double), + static_cast(full_shape[0]), + static_cast(full_shape[1])); + parms.srcPos = make_cudaPos(0, 0, 0); + parms.dstPos = make_cudaPos(static_cast(start[0]) * sizeof(double), + static_cast(start[1]), + static_cast(start[2])); + parms.extent = make_cudaExtent(static_cast(region_shape[0]) * sizeof(double), + static_cast(region_shape[1]), + static_cast(region_shape[2])); + parms.kind = cudaMemcpyHostToDevice; + + cudaError_t err = cudaMemcpy3D(&parms); + if (err != cudaSuccess) + { + cerr << "cudaMemcpy3D(H2D buffer->region) failed: " << cudaGetErrorString(err) << endl; + return 1; + } + + return 0; +} + __global__ void test_const_address(double * testd){ int _t = blockIdx.x*blockDim.x+threadIdx.x; if(_t == 0) diff --git a/AMSS_NCKU_source/bssn_gpu.h b/AMSS_NCKU_source/bssn_gpu.h index 1e89ca0..bb1e50e 100644 --- a/AMSS_NCKU_source/bssn_gpu.h +++ b/AMSS_NCKU_source/bssn_gpu.h @@ -91,6 +91,13 @@ int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr, const int *region_shape, const double *region_llb, double *host_dst_ptr); +int bssn_gpu_stage_upload_buffer_to_region(const double *host_src_ptr, + double *host_dst_ptr, + const int *full_shape, + const double *full_llb, + const double *full_uub, + const int *region_shape, + const double *region_llb); /** Init GPU side data in GPUMeta. */ // void init_fluid_meta_gpu(GPUMeta *gpu_meta);