diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index b0fbb57..dd01b7d 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -145,6 +145,18 @@ int cuda_seg_begin(const Parallel::gridseg *seg, Block *bg, int dir) return (int)floor((seg->llb[dir] - bg->bbox[dir]) / dx + 0.5); } +int cuda_state_var_count(MyList *src_vars, MyList *dst_vars) +{ + int count = 0; + while (src_vars && dst_vars) + { + ++count; + src_vars = src_vars->next; + dst_vars = dst_vars->next; + } + return (src_vars || dst_vars) ? -1 : count; +} + #if USE_CUDA_BSSN bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type) { @@ -163,15 +175,17 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type) bool cuda_direct_pack_segment(double *buffer, const Parallel::gridseg *src, const Parallel::gridseg *dst, - int state_index) + int state_count) { + if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) + return false; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; const int i0 = cuda_seg_begin(dst, src->Bg, 0); const int j0 = cuda_seg_begin(dst, src->Bg, 1); const int k0 = cuda_seg_begin(dst, src->Bg, 2); - const bool ok = bssn_cuda_pack_state_region_to_host_buffer(src->Bg, state_index, buffer, src->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + const bool ok = bssn_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; if (sync_profile_enabled()) sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; return ok; @@ -179,15 +193,17 @@ bool cuda_direct_pack_segment(double *buffer, bool cuda_direct_unpack_segment(double *buffer, const Parallel::gridseg *dst, - int state_index) + int state_count) { + if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) + return false; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; const int i0 = cuda_seg_begin(dst, dst->Bg, 0); const int j0 = cuda_seg_begin(dst, dst->Bg, 1); const int k0 = cuda_seg_begin(dst, dst->Bg, 2); - const bool ok = bssn_cuda_unpack_state_region_from_host_buffer(dst->Bg, state_index, buffer, dst->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + const bool ok = bssn_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; if (sync_profile_enabled()) sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0; return ok; @@ -3921,21 +3937,14 @@ int Parallel::data_packer(double *data, MyList *src, MyList

*varls, *varld; - - varls = VarLists; - varld = VarListd; - while (varls && varld) - { - varls = varls->next; - varld = varld->next; - } - - if (varls || varld) - { - cout << "error in short data packer, var lists does not match." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } + MyList *varls, *varld; + + const int state_count = cuda_state_var_count(VarLists, VarListd); + if (state_count < 0) + { + cout << "error in short data packer, var lists does not match." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } int type; /* 1 copy, 2 restrict, 3 prolong */ if (src->data->Bg->lev == dst->data->Bg->lev) @@ -3961,7 +3970,7 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, dst->data, type)) { - handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_idx); + handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count); if (!handled_by_cuda) { cout << "Parallel::data_packer: CUDA direct pack failed." << endl; @@ -3970,7 +3979,7 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, type)) { - handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_idx); + handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_count); if (!handled_by_cuda) { cout << "Parallel::data_packer: CUDA direct unpack failed." << endl; @@ -4005,6 +4014,16 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data->llb, dst->data->uub); #if USE_CUDA_BSSN } + else + { + size_out += (state_count - 1) * dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2]; + while (varls->next && varld->next) + { + varls = varls->next; + varld = varld->next; + ++state_idx; + } + } #endif } size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2]; diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index ecbac7a..9cda5f7 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -53,6 +53,12 @@ using namespace std; #if USE_CUDA_BSSN namespace { +static const int k_bssn_cuda_bh_state_indices[3] = {18, 19, 20}; +static const int k_bssn_cuda_ga_state_indices[12] = { + 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13 +}; + bool fill_bssn_cuda_views(Block *cg, MyList *vars, double **host_views, double *propspeeds = nullptr, @@ -82,11 +88,48 @@ bool bssn_cuda_use_resident_sync(int lev) (void)lev; return false; #else - return lev == 0; + return true; #endif } -void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int myrank) +bool bssn_cuda_sync_subset(Block *cg, + int subset_count, + const int *state_indices, + double **host_views, + bool upload) +{ + if (!cg || subset_count <= 0) + return true; + if (!bssn_cuda_has_resident_state(cg)) + return true; + if (upload) + return bssn_cuda_upload_state_subset(cg, cg->shape, subset_count, state_indices, host_views) == 0; + return bssn_cuda_download_state_subset(cg, cg->shape, subset_count, state_indices, host_views) == 0; +} + +bool bssn_cuda_sync_ga_fields(Block *cg, MyList *vars, bool upload) +{ + double *ga_fields[12]; + int idx = 0; + while (vars && idx < 12) + { + ga_fields[idx++] = cg->fgfs[vars->data->sgfn]; + vars = vars->next; + } + if (idx != 12) + return false; + return bssn_cuda_sync_subset(cg, 12, k_bssn_cuda_ga_state_indices, ga_fields, upload); +} + +bool bssn_cuda_sync_bh_fields(Block *cg, var *forx, var *fory, var *forz, bool upload) +{ + double *bh_fields[3] = { + cg->fgfs[forx->sgfn], cg->fgfs[fory->sgfn], cg->fgfs[forz->sgfn] + }; + return bssn_cuda_sync_subset(cg, 3, k_bssn_cuda_bh_state_indices, bh_fields, upload); +} + +void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int myrank, bool release_ctx) { MyList *Pp = PatL; while (Pp) @@ -108,7 +151,32 @@ void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int cout << "CUDA resident state download failed" << endl; MPI_Abort(MPI_COMM_WORLD, 1); } - bssn_cuda_release_step_ctx(cg); + if (release_ctx) + bssn_cuda_release_step_ctx(cg); + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} + +void bssn_cuda_sync_level_bh_fields(MyList *PatL, + int myrank, + var *forx, var *fory, var *forz) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && !bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false)) + { + cout << "CUDA BH state subset download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); } if (BP == Pp->data->ble) break; @@ -3187,6 +3255,24 @@ void bssn_class::Step(int lev, int YN) bool used_gpu_substep = false; bool used_gpu_resident_state = false; #if USE_CUDA_BSSN + if (use_cuda_resident_sync) + { + if (!bssn_cuda_sync_ga_fields(cg, StateList->next->next, false)) + { + cout << "CUDA predictor GA subset download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + f_enforce_ga(cg->shape, + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], + cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], + cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); + if (!bssn_cuda_sync_ga_fields(cg, StateList->next->next, true)) + { + cout << "CUDA predictor GA subset upload failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } { double *state_in[BSSN_CUDA_STATE_COUNT]; double *state_out[BSSN_CUDA_STATE_COUNT]; @@ -3206,7 +3292,7 @@ void bssn_class::Step(int lev, int YN) int keep_resident_state = use_cuda_resident_sync ? 1 : 0; int apply_enforce_ga = 0; #if (AGM == 0) - apply_enforce_ga = 1; + apply_enforce_ga = use_cuda_resident_sync ? 0 : 1; #endif #if (SommerType == 0) #ifndef WithShell @@ -3519,6 +3605,17 @@ void bssn_class::Step(int lev, int YN) #endif Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); +#if USE_CUDA_BSSN + const bool need_analysis_state_after_predictor = + (lev == a_lev) && (LastAnas + dT_lev >= AnasTime); + const bool need_bh_state_after_predictor = + (BH_num > 0) && (lev == GH->levels - 1); + if (use_cuda_resident_sync && need_analysis_state_after_predictor) + bssn_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false); + else if (use_cuda_resident_sync && need_bh_state_after_predictor) + bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx, Sfy, Sfz); +#endif + #ifdef WithShell // Complete non-blocking error reduction and check MPI_Wait(&err_req, MPI_STATUS_IGNORE); @@ -3609,6 +3706,24 @@ void bssn_class::Step(int lev, int YN) bool used_gpu_substep = false; bool used_gpu_resident_state = false; #if USE_CUDA_BSSN + if (use_cuda_resident_sync) + { + if (!bssn_cuda_sync_ga_fields(cg, SynchList_pre->next->next, false)) + { + cout << "CUDA corrector GA subset download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + f_enforce_ga(cg->shape, + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); + if (!bssn_cuda_sync_ga_fields(cg, SynchList_pre->next->next, true)) + { + cout << "CUDA corrector GA subset upload failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } { double *state_in[BSSN_CUDA_STATE_COUNT]; double *state_out[BSSN_CUDA_STATE_COUNT]; @@ -3628,9 +3743,9 @@ void bssn_class::Step(int lev, int YN) int keep_resident_state = use_cuda_resident_sync ? 1 : 0; int apply_enforce_ga = 0; #if (AGM == 0) - apply_enforce_ga = 1; + apply_enforce_ga = use_cuda_resident_sync ? 0 : 1; #elif (AGM == 1) - apply_enforce_ga = (iter_count == 3) ? 1 : 0; + apply_enforce_ga = (iter_count == 3 && !use_cuda_resident_sync) ? 1 : 0; #endif #if (SommerType == 0) #ifndef WithShell @@ -3993,6 +4108,11 @@ void bssn_class::Step(int lev, int YN) } #endif +#if USE_CUDA_BSSN + if (use_cuda_resident_sync && BH_num > 0 && lev == GH->levels - 1 && iter_count < 3) + bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx1, Sfy1, Sfz1); +#endif + // swap time level if (iter_count < 3) { @@ -4046,7 +4166,7 @@ void bssn_class::Step(int lev, int YN) } #if USE_CUDA_BSSN if (use_cuda_resident_sync) - bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank); + bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true); #endif #if (RPS == 0) // mesh refinement boundary part diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index abcad51..c8efa0a 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -362,6 +362,8 @@ static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = { S_Ayz_rhs, S_Azz_rhs, S_chi_rhs, S_trK_rhs, S_Gamx_rhs, S_Gamy_rhs }; +__constant__ int d_subset_state_indices[BSSN_STATE_COUNT]; + static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = { 1, 1, 1, 1, 1, -1, @@ -395,19 +397,25 @@ struct StepContext { double *d_state_curr_mem; double *d_state_next_mem; double *d_matter_mem; + double *d_comm_mem; + double *h_comm_mem; std::array d_state0; std::array d_accum; std::array d_state_curr; std::array d_state_next; std::array d_matter; size_t cap_all; + size_t cap_comm; + bool h_comm_pinned; + size_t cap_h_comm; bool matter_ready; bool state_ready; StepContext() : d_state0_mem(nullptr), d_accum_mem(nullptr), d_state_curr_mem(nullptr), d_state_next_mem(nullptr), - d_matter_mem(nullptr), cap_all(0), + d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr), + cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0), matter_ready(false), state_ready(false) { d_state0.fill(nullptr); @@ -584,11 +592,65 @@ static void release_step_ctx(void *block_tag) { auto it = g_step_ctx.find(block_tag); if (it == g_step_ctx.end()) return; + if (it->second.d_comm_mem) { + cudaFree(it->second.d_comm_mem); + it->second.d_comm_mem = nullptr; + it->second.cap_comm = 0; + } + if (it->second.h_comm_mem) { + if (it->second.h_comm_pinned) cudaFreeHost(it->second.h_comm_mem); + else free(it->second.h_comm_mem); + it->second.h_comm_mem = nullptr; + it->second.h_comm_pinned = false; + it->second.cap_h_comm = 0; + } StepAllocation alloc = detach_step_allocation(it->second); recycle_step_allocation(alloc); g_step_ctx.erase(it); } +static double *ensure_step_comm_buffer(StepContext &ctx, size_t needed_doubles) +{ + if (needed_doubles == 0) return nullptr; + if (ctx.cap_comm < needed_doubles) { + if (ctx.d_comm_mem) { + CUDA_CHECK(cudaFree(ctx.d_comm_mem)); + ctx.d_comm_mem = nullptr; + } + CUDA_CHECK(cudaMalloc(&ctx.d_comm_mem, needed_doubles * sizeof(double))); + ctx.cap_comm = needed_doubles; + } + return ctx.d_comm_mem; +} + +static double *ensure_step_host_comm_buffer(StepContext &ctx, size_t needed_doubles) +{ + if (needed_doubles == 0) return nullptr; + if (ctx.cap_h_comm < needed_doubles) { + if (ctx.h_comm_mem) { + if (ctx.h_comm_pinned) cudaFreeHost(ctx.h_comm_mem); + else free(ctx.h_comm_mem); + ctx.h_comm_mem = nullptr; + ctx.h_comm_pinned = false; + } + + const size_t bytes = needed_doubles * sizeof(double); + cudaError_t err = cudaMallocHost((void **)&ctx.h_comm_mem, bytes); + if (err == cudaSuccess) { + ctx.h_comm_pinned = true; + } else { + ctx.h_comm_mem = (double *)malloc(bytes); + ctx.h_comm_pinned = false; + if (!ctx.h_comm_mem) { + fprintf(stderr, "Host comm allocation failed (%zu bytes)\n", bytes); + exit(EXIT_FAILURE); + } + } + ctx.cap_h_comm = needed_doubles; + } + return ctx.h_comm_mem; +} + static void upload_grid_params_if_needed(const GridParams &gp) { if (!g_gp_host_cache_valid || @@ -1681,7 +1743,7 @@ __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx, - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz; - lscale = ONE / cbrt(lscale); + lscale = ONE / pow(lscale, F1O3); lgxx *= lscale; lgxy *= lscale; @@ -3446,6 +3508,88 @@ static void download_state_outputs(double **state_host_out, size_t all) } } +__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + int i0, int j0, int k0, + int sx, int sy, int sz, + int region_all, + int state_count, + int all) +{ + const size_t total = (size_t)region_all * (size_t)state_count; + for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + tid < total; + tid += (size_t)blockDim.x * gridDim.x) + { + const int state_index = (int)(tid / (size_t)region_all); + const int local = (int)(tid - (size_t)state_index * region_all); + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny; + dst[tid] = src_mem[(size_t)state_index * all + src]; + } +} + +__global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem, + const double * __restrict__ src, + int nx, int ny, + int i0, int j0, int k0, + int sx, int sy, int sz, + int region_all, + int state_count, + int all) +{ + const size_t total = (size_t)region_all * (size_t)state_count; + for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + tid < total; + tid += (size_t)blockDim.x * gridDim.x) + { + const int state_index = (int)(tid / (size_t)region_all); + const int local = (int)(tid - (size_t)state_index * region_all); + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny; + dst_mem[(size_t)state_index * all + dst] = src[tid]; + } +} + +__global__ void kern_pack_state_subset(const double * __restrict__ src_mem, + double * __restrict__ dst, + int subset_count, + int all) +{ + const size_t total = (size_t)subset_count * (size_t)all; + for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + tid < total; + tid += (size_t)blockDim.x * gridDim.x) + { + const int subset_slot = (int)(tid / (size_t)all); + const int state_index = d_subset_state_indices[subset_slot]; + const int src = (int)(tid - (size_t)subset_slot * all); + dst[tid] = src_mem[(size_t)state_index * all + src]; + } +} + +__global__ void kern_unpack_state_subset(double * __restrict__ dst_mem, + const double * __restrict__ src, + int subset_count, + int all) +{ + const size_t total = (size_t)subset_count * (size_t)all; + for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + tid < total; + tid += (size_t)blockDim.x * gridDim.x) + { + const int subset_slot = (int)(tid / (size_t)all); + const int state_index = d_subset_state_indices[subset_slot]; + const int dst = (int)(tid - (size_t)subset_slot * all); + dst_mem[(size_t)state_index * all + dst] = src[tid]; + } +} + static void copy_state_region_cuda(void *block_tag, int state_index, double *host_state, @@ -3508,6 +3652,41 @@ static void copy_state_region_packed_cuda(void *block_tag, CUDA_CHECK(cudaMemcpy3D(&p)); } +static void copy_state_region_packed_batch_cuda(void *block_tag, + int state_count, + double *host_buffer, + const int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz, + cudaMemcpyKind kind) +{ + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (sx <= 0 || sy <= 0 || sz <= 0) return; + + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int region_all = sx * sy * sz; + const size_t total_doubles = (size_t)state_count * (size_t)region_all; + double *d_comm = ensure_step_comm_buffer(ctx, total_doubles); + + if (kind == cudaMemcpyDeviceToHost) { + kern_pack_state_region_batch<<>>( + ctx.d_state_curr_mem, d_comm, ex[0], ex[1], + i0, j0, k0, sx, sy, sz, region_all, state_count, + ex[0] * ex[1] * ex[2]); + CUDA_CHECK(cudaMemcpy(host_buffer, d_comm, + total_doubles * sizeof(double), + cudaMemcpyDeviceToHost)); + } else { + CUDA_CHECK(cudaMemcpy(d_comm, host_buffer, + total_doubles * sizeof(double), + cudaMemcpyHostToDevice)); + kern_unpack_state_region_batch<<>>( + ctx.d_state_curr_mem, d_comm, ex[0], ex[1], + i0, j0, k0, sx, sy, sz, region_all, state_count, + ex[0] * ex[1] * ex[2]); + } +} + static void download_resident_state(void *block_tag, int *ex, double **state_host_out) { const size_t all = (size_t)ex[0] * ex[1] * ex[2]; @@ -3521,6 +3700,63 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos } } +static void copy_state_subset(void *block_tag, + int *ex, + int subset_count, + const int *state_indices, + double **state_host, + cudaMemcpyKind kind) +{ + if (subset_count <= 0) return; + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const size_t bytes = all * sizeof(double); + StepContext &ctx = ensure_step_ctx(block_tag, all); + int active_state_indices[BSSN_STATE_COUNT]; + double *active_state_host[BSSN_STATE_COUNT]; + int active_count = 0; + + for (int i = 0; i < subset_count; ++i) { + const int state_index = state_indices[i]; + if (state_index < 0 || state_index >= BSSN_STATE_COUNT) continue; + if (!state_host[i]) continue; + active_state_indices[active_count] = state_index; + active_state_host[active_count] = state_host[i]; + ++active_count; + } + if (active_count <= 0) return; + + const size_t total_doubles = (size_t)active_count * all; + double *d_comm = ensure_step_comm_buffer(ctx, total_doubles); + double *h_comm = ensure_step_host_comm_buffer(ctx, total_doubles); + + CUDA_CHECK(cudaMemcpyToSymbol(d_subset_state_indices, active_state_indices, + (size_t)active_count * sizeof(int), + 0, cudaMemcpyHostToDevice)); + if (kind == cudaMemcpyDeviceToHost) { + kern_pack_state_subset<<>>( + ctx.d_state_curr_mem, d_comm, active_count, (int)all); + CUDA_CHECK(cudaMemcpy(h_comm, d_comm, + total_doubles * sizeof(double), + cudaMemcpyDeviceToHost)); + for (int i = 0; i < active_count; ++i) { + std::memcpy(active_state_host[i], + h_comm + (size_t)i * all, + bytes); + } + } else { + for (int i = 0; i < active_count; ++i) { + std::memcpy(h_comm + (size_t)i * all, + active_state_host[i], + bytes); + } + CUDA_CHECK(cudaMemcpy(d_comm, h_comm, + total_doubles * sizeof(double), + cudaMemcpyHostToDevice)); + kern_unpack_state_subset<<>>( + ctx.d_state_curr_mem, d_comm, active_count, (int)all); + } +} + static bool has_resident_state(void *block_tag) { auto it = g_step_ctx.find(block_tag); @@ -4186,6 +4422,66 @@ int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex, + i0, j0, k0, sx, sy, sz, + cudaMemcpyDeviceToHost); + return 0; +} + +extern "C" +int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex, + i0, j0, k0, sx, sy, sz, + cudaMemcpyHostToDevice); + return 0; +} + +extern "C" +int bssn_cuda_download_state_subset(void *block_tag, + int *ex, + int subset_count, + const int *state_indices, + double **state_host_out) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_subset(block_tag, ex, subset_count, state_indices, state_host_out, + cudaMemcpyDeviceToHost); + return 0; +} + +extern "C" +int bssn_cuda_upload_state_subset(void *block_tag, + int *ex, + int subset_count, + const int *state_indices, + double **state_host_in) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_subset(block_tag, ex, subset_count, state_indices, state_host_in, + cudaMemcpyHostToDevice); + return 0; +} + extern "C" int bssn_cuda_has_resident_state(void *block_tag) { diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 8d12c40..9f473e5 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -86,6 +86,32 @@ int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int bssn_cuda_download_state_subset(void *block_tag, + int *ex, + int subset_count, + const int *state_indices, + double **state_host_out); + +int bssn_cuda_upload_state_subset(void *block_tag, + int *ex, + int subset_count, + const int *state_indices, + double **state_host_in); + int bssn_cuda_has_resident_state(void *block_tag); void bssn_cuda_release_step_ctx(void *block_tag);