diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C index 2b3d08e..3950a2e 100644 --- a/AMSS_NCKU_source/bssn_cuda_step.C +++ b/AMSS_NCKU_source/bssn_cuda_step.C @@ -138,6 +138,111 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) } }; + auto ensure_stage_device_var_list = + [&](Block *cg, MyList *var_list) { + const int n = cg->shape[0] * cg->shape[1] * cg->shape[2]; + while (var_list) + { + double *host_ptr = cg->fgfs[var_list->data->sgfn]; + if (!bssn_gpu_find_device_buffer(host_ptr) && + bssn_gpu_stage_upload_buffer(host_ptr, n)) + { + cerr << "GPU state ensure failure: lev=" << lev + << " var=" << var_list->data->name + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + break; + } + var_list = var_list->next; + } + }; + + auto refresh_synced_device_regions = + [&](Block *cg, MyList *var_list, Parallel::SyncCache &cache) { + std::vector local_segments; + for (int node = 0; node < cache.cpusize; ++node) + { + MyList *seg = cache.combined_dst[node]; + while (seg) + { + if (seg->data && seg->data->Bg == cg) + local_segments.push_back(seg->data); + seg = seg->next; + } + } + + if (local_segments.empty()) + return; + + const int n = cg->shape[0] * cg->shape[1] * cg->shape[2]; + while (var_list) + { + double *host_ptr = cg->fgfs[var_list->data->sgfn]; + if (!bssn_gpu_find_device_buffer(host_ptr)) + { + if (bssn_gpu_stage_upload_buffer(host_ptr, n)) + { + cerr << "GPU sync refresh upload failure: lev=" << lev + << " var=" << var_list->data->name + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + break; + } + } + else + { + for (size_t i = 0; i < local_segments.size(); ++i) + { + Parallel::gridseg *seg = local_segments[i]; + if (bssn_gpu_stage_upload_region(host_ptr, + cg->shape, + cg->bbox, + cg->bbox + dim, + seg->shape, + seg->llb)) + { + cerr << "GPU sync region refresh failure: lev=" << lev + << " var=" << var_list->data->name + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + break; + } + } + if (ERROR) + break; + } + var_list = var_list->next; + } + }; + + auto refresh_stage_device_after_sync = + [&](MyList *var_list, Parallel::SyncCache &cache) { + MyList *patch_it = GH->PatL[lev]; + while (patch_it) + { + MyList *block_it = patch_it->data->blb; + while (block_it) + { + Block *cg = block_it->data; + if (myrank == cg->rank) + refresh_synced_device_regions(cg, var_list, cache); + + if (block_it == patch_it->data->ble) + break; + block_it = block_it->next; + } + if (ERROR) + break; + patch_it = patch_it->next; + } + }; + MyList *Pp = GH->PatL[lev]; while (Pp) { @@ -182,7 +287,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Parallel::AsyncSyncState async_pre; Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); - bssn_gpu_clear_cached_device_buffers(); + if (!ERROR) + refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]); MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE); if (ERROR) @@ -234,7 +340,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { - stage_upload_var_list(cg, SynchList_pre); + ensure_stage_device_var_list(cg, SynchList_pre); if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN)) ERROR = 1; @@ -270,7 +376,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Parallel::AsyncSyncState async_cor; Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); - bssn_gpu_clear_cached_device_buffers(); + if (!ERROR && iter_count < 3) + refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]); MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE); if (ERROR) diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index bcdf57e..fc99f64 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -847,6 +847,73 @@ int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count) return prepare_owned_buffer(host_ptr, static_cast(count), true) ? 0 : 1; } +int bssn_gpu_stage_upload_region(const double *host_ptr, + const int *full_shape, + const double *full_llb, + const double *full_uub, + const int *region_shape, + const double *region_llb) +{ + if (!host_ptr || !full_shape || !full_llb || !full_uub || !region_shape || !region_llb) + return 1; + + const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr); + if (!device_ptr) + return 1; + + int start[3] = {0, 0, 0}; + for (int i = 0; i < 3; ++i) + { + if (full_shape[i] <= 0 || region_shape[i] <= 0) + return 1; + +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + const double dx = (full_uub[i] - full_llb[i]) / static_cast(full_shape[i] - 1); + start[i] = static_cast((region_llb[i] - full_llb[i]) / dx + 0.4); +#else +#ifdef Cell + const double dx = (full_uub[i] - full_llb[i]) / static_cast(full_shape[i]); + start[i] = static_cast((region_llb[i] - full_llb[i]) / dx + 0.4); +#else +#error Not define Vertex nor Cell +#endif +#endif + + if (start[i] < 0 || start[i] + region_shape[i] > full_shape[i]) + return 1; + } + + cudaMemcpy3DParms parms = {}; + parms.srcPtr = make_cudaPitchedPtr(const_cast(host_ptr), + static_cast(full_shape[0]) * sizeof(double), + static_cast(full_shape[0]), + static_cast(full_shape[1])); + parms.dstPtr = make_cudaPitchedPtr(const_cast(device_ptr), + static_cast(full_shape[0]) * sizeof(double), + static_cast(full_shape[0]), + static_cast(full_shape[1])); + parms.srcPos = make_cudaPos(static_cast(start[0]) * sizeof(double), + static_cast(start[1]), + static_cast(start[2])); + parms.dstPos = parms.srcPos; + parms.extent = make_cudaExtent(static_cast(region_shape[0]) * sizeof(double), + static_cast(region_shape[1]), + static_cast(region_shape[2])); + parms.kind = cudaMemcpyHostToDevice; + + cudaError_t err = cudaMemcpy3D(&parms); + if (err != cudaSuccess) + { + cerr << "cudaMemcpy3D(H2D region) failed: " << cudaGetErrorString(err) << endl; + return 1; + } + + return 0; +} + __global__ void test_const_address(double * testd){ int _t = blockIdx.x*blockDim.x+threadIdx.x; if(_t == 0) diff --git a/AMSS_NCKU_source/bssn_gpu.h b/AMSS_NCKU_source/bssn_gpu.h index d69e8d0..19333c0 100644 --- a/AMSS_NCKU_source/bssn_gpu.h +++ b/AMSS_NCKU_source/bssn_gpu.h @@ -71,6 +71,12 @@ const double *bssn_gpu_find_device_buffer(const double *host_ptr); void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr); int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count); int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count); +int bssn_gpu_stage_upload_region(const double *host_ptr, + const int *full_shape, + const double *full_llb, + const double *full_uub, + const int *region_shape, + const double *region_llb); /** Init GPU side data in GPUMeta. */ // void init_fluid_meta_gpu(GPUMeta *gpu_meta);