diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 311f4be..239574e 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -424,14 +424,7 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type, MyList return false; if (z4c_cuda_has_resident_state(dst->Bg) == 0) return false; - if (type != 1 && VarListd) - { - double *view_ptrs[Z4C_CUDA_STATE_COUNT]; - if (!cuda_build_z4c_host_views(dst->Bg, VarListd, Z4C_CUDA_STATE_COUNT, view_ptrs)) - return false; - if (z4c_cuda_resident_state_matches(dst->Bg, view_ptrs) == 0) - return false; - } + (void)VarListd; return true; #elif USE_CUDA_BSSN return bssn_cuda_has_resident_state(dst->Bg) != 0; @@ -460,9 +453,16 @@ bool cuda_direct_pack_segment(double *buffer, const int j0 = cuda_seg_begin(dst, src->Bg, 1); const int k0 = cuda_seg_begin(dst, src->Bg, 2); #if USE_CUDA_Z4C && (ABEtype == 2) - const bool ok = z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + double *views[Z4C_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_z4c_host_views(src->Bg, VarLists, state_count, views); + const bool ok = have_views + ? z4c_cuda_pack_state_batch_to_host_buffer_for_host_views( + src->Bg, views, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0 + : z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; #else double *views[BSSN_CUDA_STATE_COUNT]; const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views); @@ -500,9 +500,16 @@ bool cuda_direct_unpack_segment(double *buffer, const int j0 = cuda_seg_begin(dst, dst->Bg, 1); const int k0 = cuda_seg_begin(dst, dst->Bg, 2); #if USE_CUDA_Z4C && (ABEtype == 2) - const bool ok = z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + double *views[Z4C_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_z4c_host_views(dst->Bg, VarListd, state_count, views); + const bool ok = have_views + ? z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views( + dst->Bg, views, state_count, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0 + : z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; #else double *views[BSSN_CUDA_STATE_COUNT]; const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views); @@ -703,40 +710,60 @@ bool cuda_direct_pack_segment_to_device(double *buffer, { const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; bool ok = false; + double *views[Z4C_CUDA_STATE_COUNT]; double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_z4c_host_views(src->Bg, VarLists, state_count, views); const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat); if (type == 1) { const int i0 = cuda_seg_begin(dst, src->Bg, 0); const int j0 = cuda_seg_begin(dst, src->Bg, 1); const int k0 = cuda_seg_begin(dst, src->Bg, 2); - ok = z4c_cuda_pack_state_batch_to_device_buffer( - src->Bg, state_count, buffer, src->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + ok = have_views + ? z4c_cuda_pack_state_batch_to_device_buffer_for_host_views( + src->Bg, views, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0 + : z4c_cuda_pack_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; } else if (type == 2) { int first_fine[3]; if (!cuda_cell_gw3_restrict_params(src, dst, first_fine)) return false; - ok = z4c_cuda_restrict_state_batch_to_device_buffer( - src->Bg, state_count, buffer, src->Bg->shape, - dst->shape[0], dst->shape[1], dst->shape[2], - first_fine[0], first_fine[1], first_fine[2], - have_soa ? soa_flat : 0) == 0; + ok = have_views + ? z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views( + src->Bg, views, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2], + have_soa ? soa_flat : 0) == 0 + : z4c_cuda_restrict_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2], + have_soa ? soa_flat : 0) == 0; } else if (type == 3) { int first_fine_ii[3], coarse_lb[3]; if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb)) return false; - ok = z4c_cuda_prolong_state_batch_to_device_buffer( - src->Bg, state_count, buffer, src->Bg->shape, - dst->shape[0], dst->shape[1], dst->shape[2], - first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], - coarse_lb[0], coarse_lb[1], coarse_lb[2], - have_soa ? soa_flat : 0) == 0; + ok = have_views + ? z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views( + src->Bg, views, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2], + have_soa ? soa_flat : 0) == 0 + : z4c_cuda_prolong_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2], + have_soa ? soa_flat : 0) == 0; } if (sync_profile_enabled()) sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; @@ -919,10 +946,17 @@ bool cuda_direct_unpack_segment_from_device(double *buffer, const int i0 = cuda_seg_begin(dst, dst->Bg, 0); const int j0 = cuda_seg_begin(dst, dst->Bg, 1); const int k0 = cuda_seg_begin(dst, dst->Bg, 2); - const bool ok = z4c_cuda_unpack_state_batch_from_device_buffer( - dst->Bg, state_count, buffer, dst->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + double *views[Z4C_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_z4c_host_views(dst->Bg, VarListd, state_count, views); + const bool ok = have_views + ? z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views( + dst->Bg, views, state_count, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0 + : z4c_cuda_unpack_state_batch_from_device_buffer( + dst->Bg, state_count, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; if (sync_profile_enabled()) sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0; return ok; @@ -1074,23 +1108,39 @@ bool cuda_flush_device_segment_batch(Block *block, #if USE_CUDA_Z4C && (ABEtype == 2) if (state_count == Z4C_CUDA_STATE_COUNT) { + double *views[Z4C_CUDA_STATE_COUNT]; double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_z4c_host_views(block, vars, state_count, views); const bool have_soa = cuda_build_state_soa(vars, state_count, soa_flat); if (dir == PACK) { if (type == 2) - return z4c_cuda_restrict_state_segments_to_device_buffer( - block, state_count, data, block->shape, segment_count, - meta.data(), have_soa ? soa_flat : 0) == 0; + return have_views + ? z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views( + block, views, state_count, data, block->shape, segment_count, + meta.data(), have_soa ? soa_flat : 0) == 0 + : z4c_cuda_restrict_state_segments_to_device_buffer( + block, state_count, data, block->shape, segment_count, + meta.data(), have_soa ? soa_flat : 0) == 0; if (type == 3) - return z4c_cuda_prolong_state_segments_to_device_buffer( - block, state_count, data, block->shape, segment_count, - meta.data(), have_soa ? soa_flat : 0) == 0; - return z4c_cuda_pack_state_segments_to_device_buffer( - block, state_count, data, block->shape, segment_count, meta.data()) == 0; + return have_views + ? z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views( + block, views, state_count, data, block->shape, segment_count, + meta.data(), have_soa ? soa_flat : 0) == 0 + : z4c_cuda_prolong_state_segments_to_device_buffer( + block, state_count, data, block->shape, segment_count, + meta.data(), have_soa ? soa_flat : 0) == 0; + return have_views + ? z4c_cuda_pack_state_segments_to_device_buffer_for_host_views( + block, views, state_count, data, block->shape, segment_count, meta.data()) == 0 + : z4c_cuda_pack_state_segments_to_device_buffer( + block, state_count, data, block->shape, segment_count, meta.data()) == 0; } - return z4c_cuda_unpack_state_segments_from_device_buffer( - block, state_count, data, block->shape, segment_count, meta.data()) == 0; + return have_views + ? z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views( + block, views, state_count, data, block->shape, segment_count, meta.data()) == 0 + : z4c_cuda_unpack_state_segments_from_device_buffer( + block, state_count, data, block->shape, segment_count, meta.data()) == 0; } #endif #if USE_CUDA_BSSN @@ -5294,7 +5344,7 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data->llb, dst->data->uub, dst->data->shape, data + size_out, dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); } - } + } if (dir == UNPACK) // from target data to corresponding grid { f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn], diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C index 57c9772..1c50e72 100644 --- a/AMSS_NCKU_source/Z4c_class.C +++ b/AMSS_NCKU_source/Z4c_class.C @@ -388,41 +388,57 @@ bool z4c_cuda_interp_bh_point_resident(MyList *PatL, if (z4c_cuda_has_resident_state(block) && block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn) { - const int sx = ordn; - const int sy = ordn; - const int sz = ordn; - const int region_all = sx * sy * sz; - const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn); - const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn); - const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn); - double *packed_fields = new double[3 * region_all]; var *vars[3] = {forx, fory, forz}; - for (int f = 0; f < 3; f++) + static int use_device_bh_interp = -1; + if (use_device_bh_interp < 0) { - if (z4c_cuda_pack_state_region_to_host_buffer(block, - k_z4c_cuda_bh_state_indices[f], - packed_fields + f * region_all, - block->shape, - i0, j0, k0, - sx, sy, sz) != 0) + const char *env = getenv("AMSS_CUDA_Z4C_BH_INTERP_DEVICE"); + use_device_bh_interp = (env && atoi(env) != 0) ? 1 : 0; + } + bool used_device_interp = false; + if (use_device_bh_interp) + { + double soa3[9]; + for (int f = 0; f < 3; f++) { - delete[] packed_fields; - cout << "CUDA Z4C BH tile download failed" << endl; + soa3[3 * f + 0] = vars[f]->SoA[0]; + soa3[3 * f + 1] = vars[f]->SoA[1]; + soa3[3 * f + 2] = vars[f]->SoA[2]; + } + used_device_interp = + (z4c_cuda_interp_state_point3(block, block->shape, + k_z4c_cuda_bh_state_indices[0], + k_z4c_cuda_bh_state_indices[1], + k_z4c_cuda_bh_state_indices[2], + block->X[0][0], block->X[1][0], block->X[2][0], + DH[0], DH[1], DH[2], + x, y, z, + interp_ordn, interp_sym, + soa3, shellf) == 0); + } + if (!used_device_interp) + { + double *shift_views[3] = { + block->fgfs[forx->sgfn], + block->fgfs[fory->sgfn], + block->fgfs[forz->sgfn]}; + if (z4c_cuda_download_state_subset(block, block->shape, 3, + k_z4c_cuda_bh_state_indices, + shift_views) != 0) + { + cout << "CUDA Z4C BH shift download failed" << endl; MPI_Abort(MPI_COMM_WORLD, 1); } - int tile_shape[3] = {sx, sy, sz}; - f_global_interp(tile_shape, - block->X[0] + i0, - block->X[1] + j0, - block->X[2] + k0, - packed_fields + f * region_all, - shellf[f], - x, y, z, - interp_ordn, - vars[f]->SoA, - interp_sym); + f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], + block->fgfs[forx->sgfn], shellf[0], + x, y, z, interp_ordn, forx->SoA, interp_sym); + f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], + block->fgfs[fory->sgfn], shellf[1], + x, y, z, interp_ordn, fory->SoA, interp_sym); + f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], + block->fgfs[forz->sgfn], shellf[2], + x, y, z, interp_ordn, forz->SoA, interp_sym); } - delete[] packed_fields; } else { diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu index ec25cca..72c47b1 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.cu +++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu @@ -378,6 +378,7 @@ static const int STAGE_SLOT_COUNT = static constexpr int BSSN_STATE_COUNT = Z4C_CUDA_STATE_COUNT; static constexpr int BSSN_MATTER_COUNT = 10; static constexpr int BSSN_LK_FIELD_COUNT = 24; +static constexpr int Z4C_RESIDENT_BANK_COUNT = 4; static const int k_state_input_slots[BSSN_STATE_COUNT] = { S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz, @@ -454,6 +455,7 @@ struct StepContext { double *d_accum_mem; double *d_state_curr_mem; double *d_state_next_mem; + std::array d_resident_mem; double *d_matter_mem; double *d_comm_mem; double *h_comm_mem; @@ -462,36 +464,49 @@ struct StepContext { std::array d_state_curr; std::array d_state_next; std::array d_matter; - std::array resident_host; + std::array, Z4C_RESIDENT_BANK_COUNT> d_resident; + std::array, Z4C_RESIDENT_BANK_COUNT> resident_host; + std::array, Z4C_RESIDENT_BANK_COUNT> resident_host_clean; + std::array resident_age; + std::array resident_valid; size_t cap_all; size_t cap_comm; bool h_comm_pinned; size_t cap_h_comm; bool matter_ready; bool state_ready; - bool resident_host_valid; + int current_bank; + unsigned long long resident_clock; StepContext() : d_state0_mem(nullptr), d_accum_mem(nullptr), d_state_curr_mem(nullptr), d_state_next_mem(nullptr), + d_resident_mem{}, d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr), cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0), - matter_ready(false), state_ready(false), resident_host_valid(false) + matter_ready(false), state_ready(false), + current_bank(-1), resident_clock(0) { + d_resident_mem.fill(nullptr); d_state0.fill(nullptr); d_accum.fill(nullptr); d_state_curr.fill(nullptr); d_state_next.fill(nullptr); d_matter.fill(nullptr); - resident_host.fill(nullptr); + for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) { + d_resident[b].fill(nullptr); + resident_host[b].fill(nullptr); + resident_host_clean[b].fill(0); + } + resident_age.fill(0); + resident_valid.fill(false); } }; struct StepAllocation { double *d_state0_mem; double *d_accum_mem; - double *d_state_curr_mem; - double *d_state_next_mem; + std::array d_resident_mem; double *d_matter_mem; double *d_comm_mem; double *h_comm_mem; @@ -508,11 +523,17 @@ static size_t g_comm_segment_meta_cap = 0; static StepAllocation empty_step_allocation() { - StepAllocation alloc = { - nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr, - 0, 0, false, 0 - }; + StepAllocation alloc = {}; + alloc.d_state0_mem = nullptr; + alloc.d_accum_mem = nullptr; + alloc.d_resident_mem.fill(nullptr); + alloc.d_matter_mem = nullptr; + alloc.d_comm_mem = nullptr; + alloc.h_comm_mem = nullptr; + alloc.cap_all = 0; + alloc.cap_comm = 0; + alloc.h_comm_pinned = false; + alloc.cap_h_comm = 0; return alloc; } @@ -523,16 +544,22 @@ static bool has_step_allocation(const StepAllocation &alloc) static StepAllocation detach_step_allocation(StepContext &ctx) { - StepAllocation alloc = { - ctx.d_state0_mem, ctx.d_accum_mem, ctx.d_state_curr_mem, - ctx.d_state_next_mem, ctx.d_matter_mem, - ctx.d_comm_mem, ctx.h_comm_mem, - ctx.cap_all, ctx.cap_comm, ctx.h_comm_pinned, ctx.cap_h_comm - }; + StepAllocation alloc = {}; + alloc.d_state0_mem = ctx.d_state0_mem; + alloc.d_accum_mem = ctx.d_accum_mem; + alloc.d_resident_mem = ctx.d_resident_mem; + alloc.d_matter_mem = ctx.d_matter_mem; + alloc.d_comm_mem = ctx.d_comm_mem; + alloc.h_comm_mem = ctx.h_comm_mem; + alloc.cap_all = ctx.cap_all; + alloc.cap_comm = ctx.cap_comm; + alloc.h_comm_pinned = ctx.h_comm_pinned; + alloc.cap_h_comm = ctx.cap_h_comm; ctx.d_state0_mem = nullptr; ctx.d_accum_mem = nullptr; ctx.d_state_curr_mem = nullptr; ctx.d_state_next_mem = nullptr; + ctx.d_resident_mem.fill(nullptr); ctx.d_matter_mem = nullptr; ctx.d_comm_mem = nullptr; ctx.h_comm_mem = nullptr; @@ -542,13 +569,20 @@ static StepAllocation detach_step_allocation(StepContext &ctx) ctx.cap_h_comm = 0; ctx.matter_ready = false; ctx.state_ready = false; + ctx.current_bank = -1; + ctx.resident_clock = 0; ctx.d_state0.fill(nullptr); ctx.d_accum.fill(nullptr); ctx.d_state_curr.fill(nullptr); ctx.d_state_next.fill(nullptr); ctx.d_matter.fill(nullptr); - ctx.resident_host.fill(nullptr); - ctx.resident_host_valid = false; + for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) { + ctx.d_resident[b].fill(nullptr); + ctx.resident_host[b].fill(nullptr); + ctx.resident_host_clean[b].fill(0); + } + ctx.resident_age.fill(0); + ctx.resident_valid.fill(false); return alloc; } @@ -556,8 +590,9 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc { ctx.d_state0_mem = alloc.d_state0_mem; ctx.d_accum_mem = alloc.d_accum_mem; - ctx.d_state_curr_mem = alloc.d_state_curr_mem; - ctx.d_state_next_mem = alloc.d_state_next_mem; + ctx.d_state_curr_mem = nullptr; + ctx.d_state_next_mem = nullptr; + ctx.d_resident_mem = alloc.d_resident_mem; ctx.d_matter_mem = alloc.d_matter_mem; ctx.d_comm_mem = alloc.d_comm_mem; ctx.h_comm_mem = alloc.h_comm_mem; @@ -567,8 +602,14 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc ctx.cap_h_comm = alloc.cap_h_comm; ctx.matter_ready = false; ctx.state_ready = false; - ctx.resident_host.fill(nullptr); - ctx.resident_host_valid = false; + ctx.current_bank = -1; + ctx.resident_clock = 0; + for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) { + ctx.resident_host[b].fill(nullptr); + ctx.resident_host_clean[b].fill(0); + } + ctx.resident_age.fill(0); + ctx.resident_valid.fill(false); } static void recycle_step_allocation(StepAllocation &alloc) @@ -657,8 +698,10 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) if (!has_step_allocation(alloc)) { CUDA_CHECK(cudaMalloc(&alloc.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double))); CUDA_CHECK(cudaMalloc(&alloc.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double))); - CUDA_CHECK(cudaMalloc(&alloc.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double))); - CUDA_CHECK(cudaMalloc(&alloc.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double))); + for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) { + CUDA_CHECK(cudaMalloc(&alloc.d_resident_mem[b], + BSSN_STATE_COUNT * all * sizeof(double))); + } CUDA_CHECK(cudaMalloc(&alloc.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double))); alloc.cap_all = all; } @@ -667,8 +710,13 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) for (int i = 0; i < BSSN_STATE_COUNT; ++i) { ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all; ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all; - ctx.d_state_curr[i] = ctx.d_state_curr_mem + (size_t)i * all; - ctx.d_state_next[i] = ctx.d_state_next_mem + (size_t)i * all; + for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) { + ctx.d_resident[b][i] = ctx.d_resident_mem[b] + (size_t)i * all; + } + } + if (ctx.current_bank >= 0) { + ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank]; + ctx.d_state_curr = ctx.d_resident[ctx.current_bank]; } for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all; @@ -5467,6 +5515,25 @@ __global__ void kern_unpack_state_subset(double * __restrict__ dst_mem, } } +static bool resident_key_usable(double **host_key); +static int find_resident_bank(const StepContext &ctx, double **host_key); +static void set_resident_host_clean(StepContext &ctx, int bank, bool clean); +static bool resident_host_subset_clean(const StepContext &ctx, + int bank, + int subset_count, + const int *state_indices); +static void mark_resident_host_subset_clean(StepContext &ctx, + int bank, + int subset_count, + const int *state_indices, + bool clean); +static void mark_resident_current_bank(StepContext &ctx, int bank); +static void update_state_ready(StepContext &ctx); +static int active_or_keyed_bank(StepContext &ctx, + double **host_key, + size_t all, + bool upload_if_missing); + static void copy_state_region_cuda(void *block_tag, int state_index, double *host_state, @@ -5494,6 +5561,12 @@ static void copy_state_region_cuda(void *block_tag, p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]); } CUDA_CHECK(cudaMemcpy3D(&p)); + if (kind == cudaMemcpyHostToDevice && ctx.current_bank >= 0) { + ctx.resident_valid[ctx.current_bank] = true; + ctx.resident_age[ctx.current_bank] = ++ctx.resident_clock; + ctx.resident_host_clean[ctx.current_bank][state_index] = 0; + update_state_ready(ctx); + } } static void copy_state_region_packed_cuda(void *block_tag, @@ -5527,6 +5600,12 @@ static void copy_state_region_packed_cuda(void *block_tag, } CUDA_CHECK(cudaMemcpy3D(&p)); + if (kind == cudaMemcpyHostToDevice && ctx.current_bank >= 0) { + ctx.resident_valid[ctx.current_bank] = true; + ctx.resident_age[ctx.current_bank] = ++ctx.resident_clock; + ctx.resident_host_clean[ctx.current_bank][state_index] = 0; + update_state_ready(ctx); + } } static void copy_state_region_packed_batch_cuda(void *block_tag, @@ -5565,6 +5644,12 @@ static void copy_state_region_packed_batch_cuda(void *block_tag, ctx.d_state_curr_mem, d_comm, ex[0], ex[1], i0, j0, k0, sx, sy, sz, region_all, state_count, ex[0] * ex[1] * ex[2]); + if (ctx.current_bank >= 0) { + ctx.resident_valid[ctx.current_bank] = true; + ctx.resident_age[ctx.current_bank] = ++ctx.resident_clock; + mark_resident_host_subset_clean(ctx, ctx.current_bank, state_count, nullptr, false); + update_state_ready(ctx); + } } } @@ -5594,7 +5679,14 @@ static void copy_state_region_packed_batch_device_cuda(void *block_tag, ctx.d_state_curr_mem, device_buffer, ex[0], ex[1], i0, j0, k0, sx, sy, sz, region_all, state_count, ex[0] * ex[1] * ex[2]); - ctx.state_ready = true; + if (ctx.current_bank >= 0) { + ctx.resident_valid[ctx.current_bank] = true; + ctx.resident_age[ctx.current_bank] = ++ctx.resident_clock; + mark_resident_host_subset_clean(ctx, ctx.current_bank, state_count, nullptr, false); + mark_resident_current_bank(ctx, ctx.current_bank); + } else { + ctx.state_ready = true; + } } } @@ -5636,7 +5728,14 @@ static void copy_state_device_segments(void *block_tag, ctx.d_state_curr_mem, device_buffer, ex[0], ex[1], d_meta, state_count, ex[0] * ex[1] * ex[2]); - ctx.state_ready = true; + if (ctx.current_bank >= 0) { + ctx.resident_valid[ctx.current_bank] = true; + ctx.resident_age[ctx.current_bank] = ++ctx.resident_clock; + mark_resident_host_subset_clean(ctx, ctx.current_bank, state_count, nullptr, false); + mark_resident_current_bank(ctx, ctx.current_bank); + } else { + ctx.state_ready = true; + } } } @@ -5715,6 +5814,14 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t bytes = all * sizeof(double); StepContext &ctx = ensure_step_ctx(block_tag, all); + int bank = find_resident_bank(ctx, state_host_out); + if (bank < 0) + bank = (ctx.current_bank >= 0) ? ctx.current_bank : active_or_keyed_bank(ctx, nullptr, all, false); + if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT || !ctx.resident_valid[bank]) + return; + mark_resident_current_bank(ctx, bank); + if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr)) + return; static int direct_download = -1; if (direct_download < 0) { const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD"); @@ -5722,18 +5829,20 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos } if (direct_download) { for (int i = 0; i < BSSN_STATE_COUNT; ++i) { - CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_state_curr[i], + CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_resident[bank][i], bytes, cudaMemcpyDeviceToHost)); } CUDA_CHECK(cudaDeviceSynchronize()); + set_resident_host_clean(ctx, bank, true); return; } - CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem, + CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank], (size_t)BSSN_STATE_COUNT * bytes, cudaMemcpyDeviceToHost)); for (int i = 0; i < BSSN_STATE_COUNT; ++i) { std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); } + set_resident_host_clean(ctx, bank, true); } static void copy_state_subset(void *block_tag, @@ -5764,14 +5873,26 @@ static void copy_state_subset(void *block_tag, const size_t total_doubles = (size_t)active_count * all; double *d_comm = ensure_step_comm_buffer(ctx, total_doubles); double *h_comm = ensure_step_host_comm_buffer(ctx, total_doubles); + double *full_key[BSSN_STATE_COUNT]; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) + full_key[i] = nullptr; + for (int i = 0; i < active_count; ++i) + full_key[active_state_indices[i]] = active_state_host[i]; + int bank = active_or_keyed_bank(ctx, nullptr, all, kind == cudaMemcpyHostToDevice); + if (kind == cudaMemcpyHostToDevice && resident_key_usable(full_key)) + bank = active_or_keyed_bank(ctx, full_key, all, false); + if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) + return; CUDA_CHECK(cudaMemcpyToSymbol(d_subset_state_indices, active_state_indices, (size_t)active_count * sizeof(int), 0, cudaMemcpyHostToDevice)); if (kind == cudaMemcpyDeviceToHost) { + if (resident_host_subset_clean(ctx, bank, active_count, active_state_indices)) + return; dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count); kern_pack_state_subset<<>>( - ctx.d_state_curr_mem, d_comm, active_count, (int)all); + ctx.d_resident_mem[bank], d_comm, active_count, (int)all); CUDA_CHECK(cudaMemcpy(h_comm, d_comm, total_doubles * sizeof(double), cudaMemcpyDeviceToHost)); @@ -5780,6 +5901,7 @@ static void copy_state_subset(void *block_tag, h_comm + (size_t)i * all, bytes); } + mark_resident_host_subset_clean(ctx, bank, active_count, active_state_indices, true); } else { for (int i = 0; i < active_count; ++i) { std::memcpy(h_comm + (size_t)i * all, @@ -5791,14 +5913,22 @@ static void copy_state_subset(void *block_tag, cudaMemcpyHostToDevice)); dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count); kern_unpack_state_subset<<>>( - ctx.d_state_curr_mem, d_comm, active_count, (int)all); + ctx.d_resident_mem[bank], d_comm, active_count, (int)all); + ctx.resident_valid[bank] = true; + ctx.resident_age[bank] = ++ctx.resident_clock; + mark_resident_current_bank(ctx, bank); + mark_resident_host_subset_clean(ctx, bank, active_count, active_state_indices, true); } } static bool has_resident_state(void *block_tag) { auto it = g_step_ctx.find(block_tag); - return it != g_step_ctx.end() && it->second.state_ready; + if (it == g_step_ctx.end()) return false; + for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) { + if (it->second.resident_valid[b]) return true; + } + return false; } static bool resident_key_usable(double **host_key) @@ -5810,26 +5940,375 @@ static bool resident_key_usable(double **host_key) return true; } -static bool resident_key_matches(const StepContext &ctx, double **host_key) +static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key) { - if (!ctx.state_ready || !ctx.resident_host_valid || !resident_key_usable(host_key)) + if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT || !resident_key_usable(host_key)) return false; for (int i = 0; i < BSSN_STATE_COUNT; ++i) { - if (ctx.resident_host[i] != host_key[i]) return false; + if (ctx.resident_host[bank][i] != host_key[i]) return false; } return true; } -static void set_resident_key(StepContext &ctx, double **host_key) +static int find_resident_bank(const StepContext &ctx, double **host_key) +{ + if (!host_key) return -1; + for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) { + if (resident_key_matches(ctx, b, host_key)) + return b; + } + return -1; +} + +static void set_resident_host_clean(StepContext &ctx, int bank, bool clean) +{ + if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) return; + ctx.resident_host_clean[bank].fill(clean ? 1 : 0); +} + +static bool resident_host_subset_clean(const StepContext &ctx, + int bank, + int subset_count, + const int *state_indices) +{ + if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) return false; + for (int i = 0; i < subset_count; ++i) { + const int state_index = state_indices ? state_indices[i] : i; + if (state_index < 0 || state_index >= BSSN_STATE_COUNT) + return false; + if (!ctx.resident_host_clean[bank][state_index]) + return false; + } + return true; +} + +static void mark_resident_host_subset_clean(StepContext &ctx, + int bank, + int subset_count, + const int *state_indices, + bool clean) +{ + if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) return; + for (int i = 0; i < subset_count; ++i) { + const int state_index = state_indices ? state_indices[i] : i; + if (state_index >= 0 && state_index < BSSN_STATE_COUNT) + ctx.resident_host_clean[bank][state_index] = clean ? 1 : 0; + } +} + +static void mark_resident_current_bank(StepContext &ctx, int bank) +{ + if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) return; + ctx.current_bank = bank; + ctx.d_state_curr_mem = ctx.d_resident_mem[bank]; + ctx.d_state_curr = ctx.d_resident[bank]; + ctx.state_ready = ctx.resident_valid[bank]; +} + +static void mark_resident_next_bank(StepContext &ctx, int bank) +{ + if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) return; + ctx.d_state_next_mem = ctx.d_resident_mem[bank]; + ctx.d_state_next = ctx.d_resident[bank]; +} + +static bool any_resident_bank_valid(const StepContext &ctx) +{ + for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) { + if (ctx.resident_valid[b]) return true; + } + return false; +} + +static void update_state_ready(StepContext &ctx) +{ + ctx.state_ready = any_resident_bank_valid(ctx); +} + +static void writeback_resident_bank(StepContext &ctx, int bank, size_t all) +{ + if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) return; + if (!ctx.resident_valid[bank]) return; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) + if (!ctx.resident_host[bank][i]) return; + if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr)) + return; + const size_t bytes = all * sizeof(double); + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + CUDA_CHECK(cudaMemcpyAsync(ctx.resident_host[bank][i], + ctx.d_resident[bank][i], + bytes, cudaMemcpyDeviceToHost)); + } + CUDA_CHECK(cudaDeviceSynchronize()); + set_resident_host_clean(ctx, bank, true); +} + +static int choose_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all) +{ + for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) { + if (b != avoid_bank && !ctx.resident_valid[b]) + return b; + } + + int best = -1; + unsigned long long best_age = 0; + for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) { + if (b == avoid_bank) continue; + if (best < 0 || ctx.resident_age[b] < best_age) { + best = b; + best_age = ctx.resident_age[b]; + } + } + if (best < 0) best = 0; + writeback_resident_bank(ctx, best, all); + ctx.resident_valid[best] = false; + ctx.resident_host[best].fill(nullptr); + ctx.resident_host_clean[best].fill(0); + ctx.resident_age[best] = 0; + if (ctx.current_bank == best) { + ctx.current_bank = -1; + ctx.d_state_curr_mem = nullptr; + ctx.d_state_curr.fill(nullptr); + } + update_state_ready(ctx); + return best; +} + +static void assign_resident_key(StepContext &ctx, int bank, double **host_key) +{ + if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT || !resident_key_usable(host_key)) + return; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) + ctx.resident_host[bank][i] = host_key[i]; + set_resident_host_clean(ctx, bank, false); + ctx.resident_age[bank] = ++ctx.resident_clock; +} + +static int ensure_resident_bank(StepContext &ctx, + double **host_key, + size_t all, + bool upload_if_missing, + int avoid_bank = -1) { if (!resident_key_usable(host_key)) { - ctx.resident_host.fill(nullptr); - ctx.resident_host_valid = false; - return; + if (ctx.current_bank >= 0) + return ctx.current_bank; + return 0; } - for (int i = 0; i < BSSN_STATE_COUNT; ++i) - ctx.resident_host[i] = host_key[i]; - ctx.resident_host_valid = true; + + int bank = find_resident_bank(ctx, host_key); + if (bank >= 0) { + ctx.resident_age[bank] = ++ctx.resident_clock; + if (!ctx.resident_valid[bank] && upload_if_missing) { + bind_state_input_slots(ctx.d_resident[bank]); + upload_state_inputs(host_key, all); + ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); + } + return bank; + } + + bank = choose_resident_bank_for_reuse(ctx, avoid_bank, all); + assign_resident_key(ctx, bank, host_key); + if (upload_if_missing) { + bind_state_input_slots(ctx.d_resident[bank]); + upload_state_inputs(host_key, all); + ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); + } else { + ctx.resident_valid[bank] = false; + set_resident_host_clean(ctx, bank, false); + } + update_state_ready(ctx); + return bank; +} + +static int reserve_resident_output_bank(StepContext &ctx, + double **host_key, + size_t all, + int input_bank) +{ + if (!resident_key_usable(host_key)) + return (ctx.current_bank >= 0) ? ctx.current_bank : 0; + if (resident_key_matches(ctx, input_bank, host_key)) + return input_bank; + + int bank = find_resident_bank(ctx, host_key); + if (bank < 0) + bank = choose_resident_bank_for_reuse(ctx, input_bank, all); + assign_resident_key(ctx, bank, host_key); + ctx.resident_valid[bank] = false; + ctx.resident_age[bank] = ++ctx.resident_clock; + update_state_ready(ctx); + return bank; +} + +static int active_or_keyed_bank(StepContext &ctx, + double **host_key, + size_t all, + bool upload_if_missing) +{ + if (resident_key_usable(host_key)) { + const int bank = ensure_resident_bank(ctx, host_key, all, upload_if_missing); + mark_resident_current_bank(ctx, bank); + return bank; + } + if (ctx.current_bank >= 0) + return ctx.current_bank; + for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) { + if (ctx.resident_valid[b]) { + mark_resident_current_bank(ctx, b); + return b; + } + } + return 0; +} + +__device__ double interp_lagrange_weight(int idx, double x, int ordn) +{ + double w = 1.0; + const double xi = (double)idx; + for (int j = 0; j < ordn; ++j) { + if (j == idx) continue; + w *= (x - (double)j) / (xi - (double)j); + } + return w; +} + +__device__ void interp_axis_window(double p, + double x0, + double dx, + int n, + int ordn, + int symmetry, + int axis, + int &base, + double &local_x) +{ + int cx_i = (int)((p - x0) / dx + 0.4) + 1; + int cx_b = cx_i - ordn / 2 + 1; + int cx_t = cx_b + ordn - 1; + int cmin = 1; + if (symmetry == 2 && axis < 2 && fabs(x0) < dx) + cmin = -ordn / 2 + 1; + if (symmetry != 0 && axis == 2 && fabs(x0) < dx) + cmin = -ordn / 2 + 1; + + if (cx_b < cmin) { + cx_b = cmin; + cx_t = cx_b + ordn - 1; + } + if (cx_t > n) { + cx_t = n; + cx_b = cx_t + 1 - ordn; + } + + base = cx_b; + if (cx_b > 0) { + const double xb = x0 + (double)(cx_b - 1) * dx; + local_x = (p - xb) / dx; + } else { + const int reflected = 1 - cx_b; + const double xb = x0 + (double)(reflected - 1) * dx; + local_x = (p + xb) / dx; + } +} + +__device__ double load_interp_value(const double * __restrict__ mem, + int nx, + int ny, + int nz, + int all, + int state, + int fi, + int fj, + int fk, + const double * __restrict__ soa) +{ + double sign = 1.0; + int ii = fi; + int jj = fj; + int kk = fk; + if (ii <= 0) { + ii = 1 - ii; + sign *= soa[0]; + } + if (jj <= 0) { + jj = 1 - jj; + sign *= soa[1]; + } + if (kk <= 0) { + kk = 1 - kk; + sign *= soa[2]; + } + if (ii < 1 || ii > nx || jj < 1 || jj > ny || kk < 1 || kk > nz) + return 0.0; + const int idx = (ii - 1) + (jj - 1) * nx + (kk - 1) * nx * ny; + return sign * mem[(size_t)state * (size_t)all + (size_t)idx]; +} + +__global__ void kern_interp_state_point3(const double * __restrict__ mem, + double * __restrict__ out, + int nx, + int ny, + int nz, + int all, + int state0, + int state1, + int state2, + double x0, + double y0, + double z0, + double dx, + double dy, + double dz, + double px, + double py, + double pz, + int ordn, + int symmetry, + double soa00, double soa01, double soa02, + double soa10, double soa11, double soa12, + double soa20, double soa21, double soa22) +{ + const int f = threadIdx.x; + if (f >= 3 || ordn <= 0 || ordn > 8) + return; + + const int states[3] = {state0, state1, state2}; + const double soa_all[9] = { + soa00, soa01, soa02, + soa10, soa11, soa12, + soa20, soa21, soa22 + }; + const double *soa = soa_all + 3 * f; + + int ib, jb, kb; + double tx, ty, tz; + interp_axis_window(px, x0, dx, nx, ordn, symmetry, 0, ib, tx); + interp_axis_window(py, y0, dy, ny, ordn, symmetry, 1, jb, ty); + interp_axis_window(pz, z0, dz, nz, ordn, symmetry, 2, kb, tz); + + double wx[8], wy[8], wz[8]; + for (int i = 0; i < ordn; ++i) { + wx[i] = interp_lagrange_weight(i, tx, ordn); + wy[i] = interp_lagrange_weight(i, ty, ordn); + wz[i] = interp_lagrange_weight(i, tz, ordn); + } + + double value = 0.0; + for (int k = 0; k < ordn; ++k) { + for (int j = 0; j < ordn; ++j) { + for (int i = 0; i < ordn; ++i) { + const double coeff = wx[i] * wy[j] * wz[k]; + value += coeff * load_interp_value(mem, nx, ny, nz, all, + states[f], + ib + i, jb + j, kb + k, + soa); + } + } + } + out[f] = value; } @@ -7788,8 +8267,6 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag, double &chitiny) { (void)T; - (void)state_host_out; - if (RK4 < 0 || RK4 > 3) return 1; using namespace z4c_cuda; @@ -7820,23 +8297,27 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag, StepContext &ctx = ensure_step_ctx(block_tag, all); const bool use_resident_state = (keep_resident_state != 0); + int input_bank = -1; + int output_bank = -1; if (use_resident_state) { - bind_state_input_slots(ctx.d_state_curr); - bind_state_output_slots(ctx.d_state_next); + input_bank = ensure_resident_bank(ctx, state_host_in, all, true); + output_bank = reserve_resident_output_bank(ctx, state_host_out, all, input_bank); + mark_resident_current_bank(ctx, input_bank); + mark_resident_next_bank(ctx, output_bank); + bind_state_input_slots(ctx.d_resident[input_bank]); + bind_state_output_slots(ctx.d_resident[output_bank]); } double t0 = profile ? cuda_profile_now_ms() : 0.0; - if (!use_resident_state || !ctx.state_ready) { + if (!use_resident_state) { upload_state_inputs(state_host_in, all); - if (use_resident_state) { - ctx.state_ready = true; - set_resident_key(ctx, state_host_in); - } } if (apply_enforce_ga) { kern_enforce_ga_cuda<<>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz], g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz], g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz], g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]); + if (use_resident_state && input_bank >= 0) + set_resident_host_clean(ctx, input_bank, false); } if (RK4 == 0) { CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi], @@ -7887,10 +8368,10 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag, t0 = profile ? cuda_profile_now_ms() : 0.0; if (use_resident_state) { - std::swap(ctx.d_state_curr_mem, ctx.d_state_next_mem); - ctx.d_state_curr.swap(ctx.d_state_next); - ctx.state_ready = true; - set_resident_key(ctx, state_host_out); + ctx.resident_valid[output_bank] = true; + ctx.resident_age[output_bank] = ++ctx.resident_clock; + set_resident_host_clean(ctx, output_bank, false); + mark_resident_current_bank(ctx, output_bank); } else { download_state_outputs(state_host_out, all); } @@ -7931,6 +8412,8 @@ extern "C" int z4c_cuda_pack_state_region_to_host_buffer(void *block_tag, using namespace z4c_cuda; init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false); copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex, i0, j0, k0, sx, sy, sz, cudaMemcpyDeviceToHost); @@ -7947,6 +8430,8 @@ extern "C" int z4c_cuda_unpack_state_region_from_host_buffer(void *block_tag, using namespace z4c_cuda; init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false); copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex, i0, j0, k0, sx, sy, sz, cudaMemcpyHostToDevice); @@ -7963,6 +8448,27 @@ extern "C" int z4c_cuda_pack_state_batch_to_host_buffer(void *block_tag, using namespace z4c_cuda; init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false); + copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex, + i0, j0, k0, sx, sy, sz, + cudaMemcpyDeviceToHost); + return 0; +} + +extern "C" int z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true); copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex, i0, j0, k0, sx, sy, sz, cudaMemcpyDeviceToHost); @@ -7979,6 +8485,29 @@ extern "C" int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag, using namespace z4c_cuda; init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false); + copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex, + i0, j0, k0, sx, sy, sz, + cudaMemcpyHostToDevice); + return 0; +} + +extern "C" int z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int bank = ensure_resident_bank(ctx, state_host_key, + (size_t)ex[0] * ex[1] * ex[2], false); + mark_resident_current_bank(ctx, bank); copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex, i0, j0, k0, sx, sy, sz, cudaMemcpyHostToDevice); @@ -7995,6 +8524,26 @@ extern "C" int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag, using namespace z4c_cuda; init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false); + copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex, + i0, j0, k0, sx, sy, sz, 1); + return 0; +} + +extern "C" int z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true); copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex, i0, j0, k0, sx, sy, sz, 1); return 0; @@ -8010,6 +8559,28 @@ extern "C" int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag, using namespace z4c_cuda; init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false); + copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex, + i0, j0, k0, sx, sy, sz, 0); + return 0; +} + +extern "C" int z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int bank = ensure_resident_bank(ctx, state_host_key, + (size_t)ex[0] * ex[1] * ex[2], false); + mark_resident_current_bank(ctx, bank); copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex, i0, j0, k0, sx, sy, sz, 0); return 0; @@ -8025,6 +8596,26 @@ extern "C" int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag, using namespace z4c_cuda; init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false); + copy_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, 1); + return 0; +} + +extern "C" int z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true); copy_state_device_segments(block_tag, state_count, device_buffer, ex, segment_count, segment_meta, 1); return 0; @@ -8040,6 +8631,28 @@ extern "C" int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag using namespace z4c_cuda; init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false); + copy_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, 0); + return 0; +} + +extern "C" int z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int bank = ensure_resident_bank(ctx, state_host_key, + (size_t)ex[0] * ex[1] * ex[2], false); + mark_resident_current_bank(ctx, bank); copy_state_device_segments(block_tag, state_count, device_buffer, ex, segment_count, segment_meta, 0); return 0; @@ -8056,6 +8669,27 @@ extern "C" int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag using namespace z4c_cuda; init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false); + restrict_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, state_soa); + return 0; +} + +extern "C" int z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true); restrict_state_device_segments(block_tag, state_count, device_buffer, ex, segment_count, segment_meta, state_soa); return 0; @@ -8072,6 +8706,27 @@ extern "C" int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag, using namespace z4c_cuda; init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false); + prolong_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, state_soa); + return 0; +} + +extern "C" int z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true); prolong_state_device_segments(block_tag, state_count, device_buffer, ex, segment_count, segment_meta, state_soa); return 0; @@ -8091,6 +8746,35 @@ extern "C" int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag, if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false); + const int region_all = sx * sy * sz; + upload_comm_state_soa(state_soa, state_count); + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_restrict_state_region_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], sx, sy, sz, + fi0, fj0, fk0, region_all, state_count, + ex[0] * ex[1] * ex[2]); + return 0; +} + +extern "C" int z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *state_soa) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true); const int region_all = sx * sy * sz; upload_comm_state_soa(state_soa, state_count); dim3 launch_grid((unsigned int)grid((size_t)region_all), @@ -8118,6 +8802,37 @@ extern "C" int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag, if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false); + const int region_all = sx * sy * sz; + upload_comm_state_soa(state_soa, state_count); + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_prolong_state_region_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], sx, sy, sz, + ii0, jj0, kk0, lbc_i, lbc_j, lbc_k, + region_all, state_count, + ex[0] * ex[1] * ex[2]); + return 0; +} + +extern "C" int z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *state_soa) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true); const int region_all = sx * sy * sz; upload_comm_state_soa(state_soa, state_count); dim3 launch_grid((unsigned int)grid((size_t)region_all), @@ -8169,11 +8884,66 @@ extern "C" int z4c_cuda_compute_constraints_resident(void *block_tag, CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); if (!block_tag || !ex || !constraint_host_out) return 1; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); - if (!ctx.state_ready) return 1; + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, nullptr, all, false); + if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT || !ctx.resident_valid[bank]) return 1; setup_grid_params(ex, X, Y, Z, Symmetry, eps, co); bind_state_input_slots(ctx.d_state_curr); - launch_z4c_rhs_pipeline((int)((size_t)ex[0] * ex[1] * ex[2]), eps); - download_constraint_outputs(constraint_host_out, (size_t)ex[0] * ex[1] * ex[2]); + launch_z4c_rhs_pipeline((int)all, eps); + download_constraint_outputs(constraint_host_out, all); + return 0; +} + +extern "C" int z4c_cuda_interp_state_point3(void *block_tag, + int *ex, + int state0, + int state1, + int state2, + double x0, + double y0, + double z0, + double dx, + double dy, + double dz, + double px, + double py, + double pz, + int ordn, + int symmetry, + const double *soa3, + double *out3) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (!block_tag || !ex || !out3 || !soa3) + return 1; + if (state0 < 0 || state0 >= BSSN_STATE_COUNT || + state1 < 0 || state1 >= BSSN_STATE_COUNT || + state2 < 0 || state2 >= BSSN_STATE_COUNT) + return 1; + if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 || + ordn <= 0 || ordn > 8 || + ex[0] < ordn || ex[1] < ordn || ex[2] < ordn) + return 1; + + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, nullptr, all, false); + if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT || !ctx.resident_valid[bank]) + return 1; + + double *d_out = ensure_step_comm_buffer(ctx, 3); + kern_interp_state_point3<<<1, 3>>>( + ctx.d_resident_mem[bank], d_out, + ex[0], ex[1], ex[2], (int)all, + state0, state1, state2, + x0, y0, z0, dx, dy, dz, + px, py, pz, ordn, symmetry, + soa3[0], soa3[1], soa3[2], + soa3[3], soa3[4], soa3[5], + soa3[6], soa3[7], soa3[8]); + CUDA_CHECK(cudaMemcpy(out3, d_out, 3 * sizeof(double), cudaMemcpyDeviceToHost)); return 0; } @@ -8204,7 +8974,8 @@ extern "C" int z4c_cuda_resident_state_matches(void *block_tag, CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); auto it = g_step_ctx.find(block_tag); if (it == g_step_ctx.end()) return 0; - return resident_key_matches(it->second, state_host_key) ? 1 : 0; + const int bank = find_resident_bank(it->second, state_host_key); + return (bank >= 0 && it->second.resident_valid[bank]) ? 1 : 0; } extern "C" void z4c_cuda_release_step_ctx(void *block_tag) diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.h b/AMSS_NCKU_source/z4c_rhs_cuda.h index 2f81925..4a9242d 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.h +++ b/AMSS_NCKU_source/z4c_rhs_cuda.h @@ -53,6 +53,14 @@ int z4c_cuda_pack_state_batch_to_host_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag, int state_count, double *host_buffer, @@ -60,6 +68,14 @@ int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -67,6 +83,14 @@ int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -74,6 +98,14 @@ int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -81,6 +113,14 @@ int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag, int segment_count, const int *segment_meta); +int z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta); + int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -88,6 +128,14 @@ int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag, int segment_count, const int *segment_meta); +int z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta); + int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -96,6 +144,15 @@ int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag, const int *segment_meta, const double *state_soa); +int z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa); + int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -104,6 +161,15 @@ int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag, const int *segment_meta, const double *state_soa); +int z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa); + int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -112,6 +178,15 @@ int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag, int fi0, int fj0, int fk0, const double *state_soa); +int z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *state_soa); + int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -121,6 +196,16 @@ int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag, int lbc_i, int lbc_j, int lbc_k, const double *state_soa); +int z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *state_soa); + int z4c_cuda_download_state_subset(void *block_tag, int *ex, int subset_count, @@ -138,6 +223,25 @@ int z4c_cuda_compute_constraints_resident(void *block_tag, int Symmetry, double eps, int co, double **constraint_host_out); +int z4c_cuda_interp_state_point3(void *block_tag, + int *ex, + int state0, + int state1, + int state2, + double x0, + double y0, + double z0, + double dx, + double dy, + double dz, + double px, + double py, + double pz, + int ordn, + int symmetry, + const double *soa3, + double *out3); + int z4c_cuda_download_constraint_outputs(int *ex, double **constraint_host_out);