From ae64a221780e41e9876ee0f0858cec35a865ded2 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Tue, 5 May 2026 23:57:42 +0800 Subject: [PATCH] Complete BSSN-EScalar CUDA resident transfers --- AMSS_NCKU_source/Parallel.C | 385 +++++++++++++++++++++++--- AMSS_NCKU_source/bssnEScalar_class.C | 182 ++++++++++++- AMSS_NCKU_source/bssn_class.C | 79 +++++- AMSS_NCKU_source/bssn_rhs_cuda.cu | 394 +++++++++++++++++++++++++-- AMSS_NCKU_source/bssn_rhs_cuda.h | 27 ++ 5 files changed, 995 insertions(+), 72 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 8c7d3fd..bc37930 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -329,6 +329,19 @@ bool cuda_state_count_direct_supported(int state_count) #endif } +#if USE_CUDA_BSSN +bool cuda_prepare_inter_time_device_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_PREPARE_INTER_DEVICE"); + enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1; + } + return enabled != 0; +} +#endif + bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type) { if (!src || !dst || !src->Bg) @@ -526,6 +539,26 @@ bool cuda_cached_device_buffers_enabled(int state_count) return cuda_aware_mpi_enabled(); } +bool cuda_uncached_device_buffers_enabled(int state_count) +{ +#if USE_CUDA_BSSN + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_UNCACHED_DEVICE_BUFFERS"); + enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1; + } + if (!enabled) + return false; + if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT) + return false; + return cuda_aware_mpi_enabled(); +#else + (void)state_count; + return false; +#endif +} + bool cuda_amr_restrict_device_enabled() { static int enabled = -1; @@ -570,6 +603,17 @@ bool cuda_amr_restrict_compare_enabled() return enabled != 0; } +bool cuda_amr_prolong_compare_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_AMR_PROLONG_COMPARE"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + bool cuda_amr_restrict_batch_enabled() { static int enabled = -1; @@ -673,6 +717,57 @@ void ensure_device_comm_buffer(double **buffers, int *caps, int idx, int length) caps[idx] = length; } +struct UncachedDeviceBuffers +{ + int cpusize; + double **send_bufs; + double **recv_bufs; + int *send_caps; + int *recv_caps; + + UncachedDeviceBuffers() + : cpusize(0), send_bufs(0), recv_bufs(0), send_caps(0), recv_caps(0) + { + } +}; + +UncachedDeviceBuffers &uncached_device_buffers() +{ + static UncachedDeviceBuffers buffers; + return buffers; +} + +void ensure_uncached_device_buffers(int cpusize) +{ + UncachedDeviceBuffers &buffers = uncached_device_buffers(); + if (buffers.cpusize == cpusize && buffers.send_bufs && buffers.recv_bufs) + return; + for (int i = 0; i < buffers.cpusize; ++i) + { + if (buffers.send_bufs && buffers.send_bufs[i]) + free_device_comm_buffer(buffers.send_bufs[i]); + if (buffers.recv_bufs && buffers.recv_bufs[i]) + free_device_comm_buffer(buffers.recv_bufs[i]); + } + delete[] buffers.send_bufs; + delete[] buffers.recv_bufs; + delete[] buffers.send_caps; + delete[] buffers.recv_caps; + + buffers.cpusize = cpusize; + buffers.send_bufs = new double *[cpusize]; + buffers.recv_bufs = new double *[cpusize]; + buffers.send_caps = new int[cpusize]; + buffers.recv_caps = new int[cpusize]; + for (int i = 0; i < cpusize; ++i) + { + buffers.send_bufs[i] = 0; + buffers.recv_bufs[i] = 0; + buffers.send_caps[i] = 0; + buffers.recv_caps[i] = 0; + } +} + bool cuda_direct_pack_segment_to_device(double *buffer, const Parallel::gridseg *src, const Parallel::gridseg *dst, @@ -849,10 +944,109 @@ bool cuda_direct_pack_segment_to_device(double *buffer, coarse_lb[0], coarse_lb[1], coarse_lb[2], have_soa ? soa_flat : 0) == 0 : bssn_cuda_prolong_state_batch_to_device_buffer( - src->Bg, state_count, buffer, src->Bg->shape, - dst->shape[0], dst->shape[1], dst->shape[2], - first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + src->Bg, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0; + if (ok && cuda_amr_prolong_compare_enabled()) + { + const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2]; + const int total = state_count * region_all; + double *cpu = new double[total]; + double *gpu = new double[total]; + if (!cuda_download_resident_subset_to_host(src->Bg, VarLists, state_count)) + { + delete[] cpu; + delete[] gpu; + return false; + } + int DIM = dim; + MyList *v = VarLists; + for (int s = 0; s < state_count && v; ++s, v = v->next) + { + f_prolong3(DIM, + src->Bg->bbox, + src->Bg->bbox + dim, + src->Bg->shape, + src->Bg->fgfs[v->data->sgfn], + const_cast(dst->llb), + const_cast(dst->uub), + const_cast(dst->shape), + cpu + (size_t)s * region_all, + const_cast(dst->llb), + const_cast(dst->uub), + v->data->SoA, + Symmetry); + } + cudaError_t cerr = cudaMemcpy(gpu, buffer, (size_t)total * sizeof(double), cudaMemcpyDeviceToHost); + if (cerr != cudaSuccess) + { + fprintf(stderr, "Parallel: prolong compare cudaMemcpy failed, err=%d\n", (int)cerr); + delete[] cpu; + delete[] gpu; + return false; + } + double max_abs = 0.0; + double max_rel = 0.0; + int max_idx = -1; + for (int i = 0; i < total; ++i) + { + const double diff = fabs(cpu[i] - gpu[i]); + const double den = fmax(fabs(cpu[i]), fabs(gpu[i])); + const double rel = den > 0.0 ? diff / den : diff; + if (diff > max_abs) + { + max_abs = diff; + max_rel = rel; + max_idx = i; + } + } + static int report_count = 0; + const double tol = cuda_amr_restrict_compare_tol(); + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (max_abs > tol || max_rel > tol) + { + const int state = max_idx / region_all; + const int local = max_idx - state * region_all; + const int ii = local % dst->shape[0]; + const int jj = (local / dst->shape[0]) % dst->shape[1]; + const int kk = local / (dst->shape[0] * dst->shape[1]); + if (report_count < cuda_amr_restrict_compare_limit()) + { + fprintf(stderr, + "[AMSS-CUDA-PROLONG-CMP][rank %d] mismatch state=%d point=(%d,%d,%d) " + "shape=(%d,%d,%d) first_fine=(%d,%d,%d) coarse_lb=(%d,%d,%d) " + "max_abs=%.17e max_rel=%.17e cpu=%.17e gpu=%.17e src_lev=%d dst_lev=%d\n", + rank, state, ii, jj, kk, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2], + max_abs, max_rel, cpu[max_idx], gpu[max_idx], + src->Bg->lev, dst->Bg->lev); + fflush(stderr); + report_count++; + } + delete[] cpu; + delete[] gpu; + return false; + } + else if (report_count < cuda_amr_restrict_compare_limit()) + { + fprintf(stderr, + "[AMSS-CUDA-PROLONG-CMP][rank %d] ok shape=(%d,%d,%d) " + "first_fine=(%d,%d,%d) coarse_lb=(%d,%d,%d) max_abs=%.17e max_rel=%.17e\n", + rank, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2], + max_abs, max_rel); + fflush(stderr); + report_count++; + } + delete[] cpu; + delete[] gpu; + } } if (sync_profile_enabled()) sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; @@ -941,22 +1135,34 @@ bool cuda_download_resident_subset_to_host(Block *block, bool cuda_unpack_host_region_to_resident(Block *block, int state_index, double *buffer, - const Parallel::gridseg *dst) + const Parallel::gridseg *dst, + MyList *vars, + int state_count) { #if USE_CUDA_BSSN - if (!block || !dst || state_index < 0 || state_index >= AMSS_BSSN_CUDA_MAX_STATE_COUNT) + if (!block || !dst || !vars || state_count <= 0 || + state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT || + state_index < 0 || state_index >= state_count) return false; if (bssn_cuda_has_resident_state(block) == 0) return true; + double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; + MyList *v = vars; + for (int i = 0; i < state_count; ++i) + { + if (!v) + return false; + views[i] = block->fgfs[v->data->sgfn]; + v = v->next; + } const int i0 = cuda_seg_begin(dst, block, 0); const int j0 = cuda_seg_begin(dst, block, 1); const int k0 = cuda_seg_begin(dst, block, 2); - return bssn_cuda_unpack_state_region_from_host_buffer( - block, state_index, buffer, block->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + return bssn_cuda_unpack_state_region_from_host_buffer_for_host_views( + block, views, state_count, state_index, buffer, block->shape, + i0, j0, k0, dst->shape[0], dst->shape[1], dst->shape[2]) == 0; #else - (void)block; (void)state_index; (void)buffer; (void)dst; + (void)block; (void)state_index; (void)buffer; (void)dst; (void)vars; (void)state_count; return false; #endif } @@ -5178,8 +5384,9 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data && dst->data->Bg && bssn_cuda_has_resident_state(dst->data->Bg)) { - if (type != 2 && type != 3 && - !cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data)) + if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, + data + size_out, dst->data, + VarListd, state_count)) { cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl; MPI_Abort(MPI_COMM_WORLD, 1); @@ -5318,13 +5525,40 @@ void Parallel::transfer(MyList **src, MyList **src, MyList 0) { - rec_data[node] = new double[recv_lengths[node]]; - if (!rec_data[node]) + if (recv_is_dev[node]) { - cout << "out of memory when new in short transfer, place 1" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); + UncachedDeviceBuffers &dev_buffers = uncached_device_buffers(); + ensure_device_comm_buffer(dev_buffers.recv_bufs, dev_buffers.recv_caps, node, recv_lengths[node]); + MPI_Irecv((void *)dev_buffers.recv_bufs[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + } + else + { + rec_data[node] = new double[recv_lengths[node]]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 1" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); } - MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); req_node[req_no] = node; req_is_recv[req_no] = 1; req_no++; @@ -5351,13 +5594,22 @@ void Parallel::transfer(MyList **src, MyList 0) { - rec_data[myrank] = new double[recv_lengths[myrank]]; - if (!rec_data[myrank]) + if (recv_is_dev[myrank]) { - cout << "out of memory when new in short transfer, place 2" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); + UncachedDeviceBuffers &dev_buffers = uncached_device_buffers(); + ensure_device_comm_buffer(dev_buffers.recv_bufs, dev_buffers.recv_caps, myrank, recv_lengths[myrank]); + data_packer_with_device_buffer(dev_buffers.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + else + { + rec_data[myrank] = new double[recv_lengths[myrank]]; + if (!rec_data[myrank]) + { + cout << "out of memory when new in short transfer, place 2" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); } - data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); } // Pack and post sends. @@ -5368,19 +5620,51 @@ void Parallel::transfer(MyList **src, MyList 0) { - send_data[node] = new double[send_lengths[node]]; - if (!send_data[node]) + if (send_is_dev[node]) { - cout << "out of memory when new in short transfer, place 3" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); + UncachedDeviceBuffers &dev_buffers = uncached_device_buffers(); + ensure_device_comm_buffer(dev_buffers.send_bufs, dev_buffers.send_caps, node, send_lengths[node]); + data_packer_with_device_buffer(dev_buffers.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); } - data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + else + { + send_data[node] = new double[send_lengths[node]]; + if (!send_data[node]) + { + cout << "out of memory when new in short transfer, place 3" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + } + send_req_index[node] = req_no; req_node[req_no] = node; req_is_recv[req_no] = 0; req_no++; } } +#if USE_CUDA_BSSN || USE_CUDA_Z4C + if (cuda_device_sends > 0) + cudaDeviceSynchronize(); +#endif + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + if (send_lengths[node] > 0) + { + const int req_idx = send_req_index[node]; + if (req_idx < 0) + continue; + if (send_is_dev[node]) + { + UncachedDeviceBuffers &dev_buffers = uncached_device_buffers(); + MPI_Isend((void *)dev_buffers.send_bufs[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_idx); + } + else + { + MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_idx); + } + } + } // Unpack as soon as receive completes to reduce pure wait time. while (pending_recv > 0) @@ -5395,7 +5679,15 @@ void Parallel::transfer(MyList **src, MyList= 0 && req_is_recv[idx]) { int recv_node = req_node[idx]; - data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); + if (recv_is_dev[recv_node]) + { + UncachedDeviceBuffers &dev_buffers = uncached_device_buffers(); + data_packer_with_device_buffer(dev_buffers.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); + } + else + { + data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); + } pending_recv--; } } @@ -5403,7 +5695,12 @@ void Parallel::transfer(MyList **src, MyList 0) MPI_Waitall(req_no, reqs, stats); - if (rec_data[myrank]) + if (recv_is_dev[myrank] && recv_lengths[myrank] > 0) + { + UncachedDeviceBuffers &dev_buffers = uncached_device_buffers(); + data_packer_with_device_buffer(dev_buffers.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + } + else if (rec_data[myrank]) data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); for (node = 0; node < cpusize; node++) @@ -5423,6 +5720,9 @@ void Parallel::transfer(MyList **src, MyList **src, MyList **dst, @@ -7057,7 +7357,13 @@ void Parallel::prepare_inter_time_level(Patch *Pat, double *src2_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; const int state_count = cuda_state_var_count(VarList1, VarList2); - if (cuda_state_count_direct_supported(state_count) && + const bool have_cuda_views = + cuda_state_count_direct_supported(state_count) && + cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && + cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && + cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views); + if (cuda_prepare_inter_time_device_enabled() && + have_cuda_views && cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) && @@ -7093,6 +7399,10 @@ void Parallel::prepare_inter_time_level(Patch *Pat, varl2 = varl2->next; varl3 = varl3->next; } +#if USE_CUDA_BSSN + if (have_cuda_views && bssn_cuda_has_resident_state(cg)) + bssn_cuda_upload_resident_state_count(cg, cg->shape, dst_views, state_count); +#endif } if (BP == Pat->ble) break; @@ -7133,7 +7443,14 @@ void Parallel::prepare_inter_time_level(Patch *Pat, double *src3_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; const int state_count = cuda_state_var_count(VarList1, VarList2); - if (cuda_state_count_direct_supported(state_count) && + const bool have_cuda_views = + cuda_state_count_direct_supported(state_count) && + cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && + cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && + cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) && + cuda_build_bssn_host_views(cg, VarList4, state_count, dst_views); + if (cuda_prepare_inter_time_device_enabled() && + have_cuda_views && cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) && @@ -7174,6 +7491,10 @@ void Parallel::prepare_inter_time_level(Patch *Pat, varl3 = varl3->next; varl4 = varl4->next; } +#if USE_CUDA_BSSN + if (have_cuda_views && bssn_cuda_has_resident_state(cg)) + bssn_cuda_upload_resident_state_count(cg, cg->shape, dst_views, state_count); +#endif } if (BP == Pat->ble) break; diff --git a/AMSS_NCKU_source/bssnEScalar_class.C b/AMSS_NCKU_source/bssnEScalar_class.C index 37ce797..cd85057 100644 --- a/AMSS_NCKU_source/bssnEScalar_class.C +++ b/AMSS_NCKU_source/bssnEScalar_class.C @@ -90,6 +90,22 @@ bool bssn_escalar_cuda_keep_resident_after_step(int lev, int trfls_in, int analy return false; if (lev == analysis_lev) return false; + static int release_only_level = -2; + if (release_only_level == -2) + { + const char *env = getenv("AMSS_CUDA_ESCALAR_RELEASE_ONLY_LEVEL"); + release_only_level = (env && atoi(env) >= 0) ? atoi(env) : -1; + } + if (release_only_level >= 0) + return lev != release_only_level; + static int keep_level_limit = -2; + if (keep_level_limit == -2) + { + const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_LEVELS_BELOW"); + keep_level_limit = (env && atoi(env) >= 0) ? atoi(env) : -1; + } + if (keep_level_limit >= 0) + return lev < keep_level_limit; if (keep_all_levels) return true; return lev < trfls_in; @@ -125,6 +141,138 @@ bool bssn_escalar_timing_enabled() return enabled != 0; } +bool bssn_escalar_cuda_post_rp_download_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_ESCALAR_POST_RP_DOWNLOAD"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool bssn_escalar_cuda_post_rp_download_level_enabled(int lev) +{ + if (!bssn_escalar_cuda_post_rp_download_enabled()) + return false; + static int min_level = -2; + if (min_level == -2) + { + const char *env = getenv("AMSS_CUDA_ESCALAR_POST_RP_MIN_LEVEL"); + min_level = (env && atoi(env) >= 0) ? atoi(env) : -1; + } + return min_level < 0 || lev >= min_level; +} + +bool bssn_escalar_cuda_post_swap_release_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_ESCALAR_POST_SWAP_RELEASE"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool bssn_escalar_cuda_pre_rp_release_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_ESCALAR_PRE_RP_RELEASE"); + enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1; + } + return enabled != 0; +} + +bool bssn_escalar_cuda_bh_interp_resident_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_BH_INTERP_RESIDENT"); + enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1; + } + return enabled != 0; +} + +bool bssn_escalar_cuda_prune_after_swap_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_ESCALAR_PRUNE_AFTER_SWAP"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +void bssn_escalar_cuda_upload_level_state(MyList *PatL, MyList *vars, + int myrank) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) + { + double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT]; + if (!fill_bssn_escalar_cuda_views(cg, vars, state_in)) + { + cout << "CUDA BSSN-EScalar resident state list mismatch during upload" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (bssn_escalar_cuda_upload_resident_state(cg, cg->shape, state_in)) + { + cout << "CUDA BSSN-EScalar resident state upload failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} + +void bssn_escalar_cuda_keep_only_level_state(MyList *PatL, MyList *vars, + int myrank) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) + { + double *state_key[BSSN_ESCALAR_CUDA_STATE_COUNT]; + if (!fill_bssn_escalar_cuda_views(cg, vars, state_key)) + { + cout << "CUDA BSSN-EScalar resident state list mismatch during prune" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (bssn_escalar_cuda_keep_only_resident_state(cg, cg->shape, state_key)) + { + cout << "CUDA BSSN-EScalar resident state prune failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} + void bssn_escalar_timing_report(int myrank, int lev, int YN, double total, double rhs, double sync, double bh, double analysis, double swap, double resident, double rp) @@ -1244,7 +1392,8 @@ void bssnEScalar_class::Step(int lev, int YN) { escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; #if USE_CUDA_BSSN - (void)use_cuda_resident_sync; + if (use_cuda_resident_sync && !bssn_escalar_cuda_bh_interp_resident_enabled()) + bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, false); #endif compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); for (int ithBH = 0; ithBH < BH_num; ithBH++) @@ -1670,7 +1819,8 @@ void bssnEScalar_class::Step(int lev, int YN) { escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; #if USE_CUDA_BSSN - (void)use_cuda_resident_sync; + if (use_cuda_resident_sync && !bssn_escalar_cuda_bh_interp_resident_enabled()) + bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false); #endif compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev); for (int ithBH = 0; ithBH < BH_num; ithBH++) @@ -1760,7 +1910,8 @@ void bssnEScalar_class::Step(int lev, int YN) { escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; if (!bssn_escalar_cuda_keep_resident_after_step(lev, trfls, a_lev)) - bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true); + bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, + bssn_escalar_cuda_pre_rp_release_enabled()); if (escalar_step_timing) escalar_t_resident += MPI_Wtime() - escalar_t0; } @@ -1833,9 +1984,28 @@ void bssnEScalar_class::Step(int lev, int YN) sPp = sPp->next; } } -#endif - // for black hole position - if (BH_num > 0 && lev == GH->levels - 1) +#endif + #if USE_CUDA_BSSN + bool release_after_sync = false; + if (use_cuda_resident_sync && bssn_escalar_cuda_post_rp_download_level_enabled(lev)) + { + escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; + release_after_sync = bssn_escalar_cuda_post_swap_release_enabled(); + bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, release_after_sync); + if (escalar_step_timing) + escalar_t_resident += MPI_Wtime() - escalar_t0; + } + if (use_cuda_resident_sync && !release_after_sync && + bssn_escalar_cuda_prune_after_swap_enabled()) + { + escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; + bssn_escalar_cuda_keep_only_level_state(GH->PatL[lev], StateList, myrank); + if (escalar_step_timing) + escalar_t_resident += MPI_Wtime() - escalar_t0; + } +#endif + // for black hole position + if (BH_num > 0 && lev == GH->levels - 1) { for (int ithBH = 0; ithBH < BH_num; ithBH++) { diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 123a175..d966a74 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -102,7 +102,15 @@ bool amss_cached_rp_restrict_enabled() { static int enabled = -1; if (enabled < 0) - enabled = amss_env_flag_enabled("AMSS_RP_CACHED_RESTRICT") ? 1 : 0; + { +#if (ABEtype == 1) + enabled = 1; +#else + enabled = 0; +#endif + if (amss_env_flag_enabled("AMSS_RP_CACHED_RESTRICT")) + enabled = 1; + } return enabled != 0; } @@ -110,7 +118,15 @@ bool amss_cached_rp_outbd_enabled() { static int enabled = -1; if (enabled < 0) - enabled = amss_env_flag_enabled("AMSS_RP_CACHED_OUTBD") ? 1 : 0; + { +#if (ABEtype == 1) + enabled = 1; +#else + enabled = 0; +#endif + if (amss_env_flag_enabled("AMSS_RP_CACHED_OUTBD")) + enabled = 1; + } return enabled != 0; } @@ -118,7 +134,15 @@ bool amss_cached_rp_fine_sync_enabled() { static int enabled = -1; if (enabled < 0) - enabled = amss_env_flag_enabled("AMSS_RP_CACHED_FINE_SYNC") ? 1 : 0; + { +#if (ABEtype == 1) + enabled = 1; +#else + enabled = 0; +#endif + if (amss_env_flag_enabled("AMSS_RP_CACHED_FINE_SYNC")) + enabled = 1; + } return enabled != 0; } @@ -819,6 +843,17 @@ bool bssn_cuda_regrid_flush_enabled() return enabled != 0; } +bool bssn_cuda_regrid_flush_always_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_REGRID_FLUSH_ALWAYS"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + bool bssn_cuda_will_regrid_onelevel(cgh *GH, int lev, int Symmetry, int BH_num, double **Porg0) { if (!GH || lev < GH->movls || lev >= GH->levels || !GH->PatL[lev]) @@ -882,8 +917,11 @@ bool bssn_cuda_will_regrid_onelevel(cgh *GH, int lev, int Symmetry, int BH_num, bool bssn_cuda_should_flush_before_regrid(cgh *GH, int lev, int Symmetry, int BH_num, double **Porg0) { - return bssn_cuda_regrid_flush_enabled() && - bssn_cuda_will_regrid_onelevel(GH, lev, Symmetry, BH_num, Porg0); + if (!bssn_cuda_regrid_flush_enabled()) + return false; + if (bssn_cuda_regrid_flush_always_enabled()) + return GH && lev >= GH->movls && lev < GH->levels && GH->PatL[lev]; + return bssn_cuda_will_regrid_onelevel(GH, lev, Symmetry, BH_num, Porg0); } void bssn_cuda_sync_level_bh_fields(MyList *PatL, @@ -925,6 +963,27 @@ bool bssn_constraint_recompute_from_state(int lev, bool level0_cache_valid) } // namespace #endif +#if USE_CUDA_BSSN +bool bssn_cuda_bh_interp_resident_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_BH_INTERP_RESIDENT"); + if (env) + enabled = (atoi(env) != 0) ? 1 : 0; +#if (ABEtype == 1) + else + enabled = 1; +#else + else + enabled = 1; +#endif + } + return enabled != 0; +} +#endif + //================================================================================================ // define bssn_class @@ -3895,10 +3954,11 @@ void bssn_class::ParallelStep() // a_stream<Regrid_Onelevel_aux for lower level"; // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],a_stream.str()); } - } - } -#endif } +} + +#endif +} #ifdef WithShell SHStep(); @@ -8392,7 +8452,8 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va int lev = ilev; #if USE_CUDA_BSSN - if (bssn_cuda_use_resident_sync(lev) && + if (bssn_cuda_bh_interp_resident_enabled() && + bssn_cuda_use_resident_sync(lev) && bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf)) { BH_RHS[n][0] = -shellf[0]; diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index da0ccd3..a26257f 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -502,7 +502,7 @@ static const int STAGE_SLOT_COUNT = static constexpr int BSSN_STATE_COUNT = 24; static constexpr int BSSN_MATTER_COUNT = 10; static constexpr int BSSN_LK_FIELD_COUNT = 24; -static constexpr int BSSN_RESIDENT_BANK_COUNT = 4; +static constexpr int BSSN_RESIDENT_BANK_COUNT = 6; static constexpr int BSSN_ESCALAR_STATE_COUNT = 26; static constexpr int BSSN_RESIDENT_STATE_CAPACITY = BSSN_ESCALAR_STATE_COUNT; @@ -5285,11 +5285,24 @@ static bool resident_key_matches(const StepContext &ctx, int bank, double **host static int find_resident_bank_count(const StepContext &ctx, double **host_key, int state_count) { if (!host_key) return -1; + int best = -1; + unsigned long long best_age = 0; + int best_invalid = -1; + unsigned long long best_invalid_age = 0; for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { - if (resident_key_matches_count(ctx, b, host_key, state_count)) - return b; + if (!resident_key_matches_count(ctx, b, host_key, state_count)) + continue; + if (ctx.resident_valid[b]) { + if (best < 0 || ctx.resident_age[b] > best_age) { + best = b; + best_age = ctx.resident_age[b]; + } + } else if (best_invalid < 0 || ctx.resident_age[b] > best_invalid_age) { + best_invalid = b; + best_invalid_age = ctx.resident_age[b]; + } } - return -1; + return (best >= 0) ? best : best_invalid; } static int find_resident_bank_subset(const StepContext &ctx, @@ -5299,6 +5312,10 @@ static int find_resident_bank_subset(const StepContext &ctx, { if (!host_key || !state_indices || subset_count <= 0) return -1; + int best = -1; + unsigned long long best_age = 0; + int best_invalid = -1; + unsigned long long best_invalid_age = 0; for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { bool match = true; for (int i = 0; i < subset_count; ++i) { @@ -5310,10 +5327,19 @@ static int find_resident_bank_subset(const StepContext &ctx, break; } } - if (match) - return b; + if (!match) + continue; + if (ctx.resident_valid[b]) { + if (best < 0 || ctx.resident_age[b] > best_age) { + best = b; + best_age = ctx.resident_age[b]; + } + } else if (best_invalid < 0 || ctx.resident_age[b] > best_invalid_age) { + best_invalid = b; + best_invalid_age = ctx.resident_age[b]; + } } - return -1; + return (best >= 0) ? best : best_invalid; } static int find_resident_bank(const StepContext &ctx, double **host_key) @@ -5373,6 +5399,16 @@ static void mark_resident_host_subset_clean(StepContext &ctx, } } +static void mark_resident_host_state_clean(StepContext &ctx, + int bank, + int state_index, + bool clean) +{ + if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return; + if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return; + ctx.resident_host_clean[bank][state_index] = clean ? 1 : 0; +} + static void mark_resident_current_bank(StepContext &ctx, int bank) { if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return; @@ -5632,6 +5668,12 @@ static int reserve_escalar_resident_output_bank(StepContext &ctx, static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c); +static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx, + int avoid_a, + int avoid_b, + int avoid_c, + size_t all); + static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx, double **host_key, size_t all, @@ -5658,7 +5700,7 @@ static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx, } } if (bank < 0) - bank = choose_escalar_resident_bank_for_reuse(ctx, avoid_a, all); + bank = choose_escalar_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all); assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT); ctx.resident_valid[bank] = false; ctx.resident_age[bank] = ++ctx.resident_clock; @@ -5734,12 +5776,140 @@ static int reserve_resident_output_bank_avoiding(StepContext &ctx, return bank; } +static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx, + int avoid_a, + int avoid_b, + int avoid_c, + size_t all) +{ + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (!bank_is_avoided(b, avoid_a, avoid_b, avoid_c) && !ctx.resident_valid[b]) + return b; + } + + int best = -1; + unsigned long long best_age = 0; + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (bank_is_avoided(b, avoid_a, avoid_b, avoid_c)) continue; + if (best < 0 || ctx.resident_age[b] < best_age) { + best = b; + best_age = ctx.resident_age[b]; + } + } + if (best < 0) + return choose_escalar_resident_bank_for_reuse(ctx, avoid_a, all); + + writeback_resident_bank_count(ctx, best, all, BSSN_ESCALAR_STATE_COUNT); + ctx.resident_valid[best] = false; + ctx.resident_host[best].fill(nullptr); + ctx.resident_host_clean[best].fill(0); + ctx.resident_age[best] = 0; + if (ctx.current_bank == best) { + ctx.current_bank = -1; + ctx.d_state_curr_mem = nullptr; + ctx.d_state_curr.fill(nullptr); + } + update_state_ready(ctx); + return best; +} + +static int ensure_resident_bank_avoiding(StepContext &ctx, + double **host_key, + size_t all, + bool upload_if_missing, + int avoid_a, + int avoid_b, + int avoid_c) +{ + if (!resident_key_usable(host_key)) { + if (ctx.current_bank >= 0) + return ctx.current_bank; + return 0; + } + + int bank = find_resident_bank(ctx, host_key); + if (bank >= 0) { + ctx.resident_age[bank] = ++ctx.resident_clock; + if (!ctx.resident_valid[bank] && upload_if_missing) { + bind_state_input_slots(ctx.d_resident[bank]); + upload_state_inputs(host_key, all); + ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); + } + return bank; + } + + bank = choose_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all); + assign_resident_key(ctx, bank, host_key); + if (upload_if_missing) { + bind_state_input_slots(ctx.d_resident[bank]); + upload_state_inputs(host_key, all); + ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); + } else { + ctx.resident_valid[bank] = false; + set_resident_host_clean(ctx, bank, false); + } + update_state_ready(ctx); + return bank; +} + +static int ensure_escalar_resident_bank_avoiding(StepContext &ctx, + double **host_key, + size_t all, + bool upload_if_missing, + int avoid_a, + int avoid_b, + int avoid_c) +{ + if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) { + if (ctx.current_bank >= 0) + return ctx.current_bank; + return 0; + } + + int bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT); + if (bank >= 0) { + ctx.resident_age[bank] = ++ctx.resident_clock; + if (!ctx.resident_valid[bank] && upload_if_missing) { + bind_escalar_state_input_slots(ctx.d_resident[bank]); + upload_escalar_state_inputs(host_key, all); + CUDA_CHECK(cudaDeviceSynchronize()); + ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); + } + return bank; + } + + bank = choose_escalar_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all); + assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT); + if (upload_if_missing) { + bind_escalar_state_input_slots(ctx.d_resident[bank]); + upload_escalar_state_inputs(host_key, all); + CUDA_CHECK(cudaDeviceSynchronize()); + ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); + } else { + ctx.resident_valid[bank] = false; + set_resident_host_clean(ctx, bank, false); + } + update_state_ready(ctx); + return bank; +} + static int active_or_keyed_bank(StepContext &ctx, double **host_key, size_t all, - bool upload_if_missing) + bool upload_if_missing, + int state_count = BSSN_STATE_COUNT) { - if (resident_key_usable(host_key)) { + if (state_count == BSSN_ESCALAR_STATE_COUNT && + resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) { + int bank = ensure_escalar_resident_bank(ctx, host_key, all, upload_if_missing); + mark_resident_current_bank(ctx, bank); + return bank; + } + if (state_count == BSSN_STATE_COUNT && resident_key_usable(host_key)) { int bank = ensure_resident_bank(ctx, host_key, all, upload_if_missing); mark_resident_current_bank(ctx, bank); return bank; @@ -6200,6 +6370,8 @@ int bssn_escalar_cuda_rk4_substep(void *block_tag, g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz], g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz], g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]); + if (use_resident_state && input_bank >= 0) + set_resident_host_clean(ctx, input_bank, false); } if (RK4 == 0) { @@ -6957,9 +7129,10 @@ static void copy_state_region_cuda(void *block_tag, ctx.resident_valid[bank] = true; ctx.resident_age[bank] = ++ctx.resident_clock; mark_resident_current_bank(ctx, bank); + mark_resident_host_state_clean(ctx, bank, state_index, false); update_state_ready(ctx); } else { - ctx.resident_host_clean[bank][state_index] = 1; + mark_resident_host_state_clean(ctx, bank, state_index, true); } } @@ -6970,9 +7143,11 @@ static void copy_state_region_packed_cuda(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz, cudaMemcpyKind kind, - double **state_host_key = nullptr) + double **state_host_key = nullptr, + int state_count = BSSN_STATE_COUNT) { if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return; + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return; if (sx <= 0 || sy <= 0 || sz <= 0) return; const size_t src_pitch = (size_t)ex[0] * sizeof(double); @@ -6980,7 +7155,8 @@ static void copy_state_region_packed_cuda(void *block_tag, StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const int bank = active_or_keyed_bank(ctx, state_host_key, all, - kind == cudaMemcpyHostToDevice); + kind == cudaMemcpyHostToDevice, + state_count); double *base_mem = ctx.d_resident_mem[bank]; cudaMemcpy3DParms p = {}; @@ -7003,9 +7179,10 @@ static void copy_state_region_packed_cuda(void *block_tag, ctx.resident_valid[bank] = true; ctx.resident_age[bank] = ++ctx.resident_clock; mark_resident_current_bank(ctx, bank); + mark_resident_host_state_clean(ctx, bank, state_index, false); update_state_ready(ctx); } else { - ctx.resident_host_clean[bank][state_index] = 1; + mark_resident_host_state_clean(ctx, bank, state_index, true); } } @@ -7024,7 +7201,8 @@ static void copy_state_region_packed_batch_cuda(void *block_tag, StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const int bank = active_or_keyed_bank(ctx, state_host_key, all, - kind == cudaMemcpyHostToDevice); + kind == cudaMemcpyHostToDevice, + state_count); double *base_mem = ctx.d_resident_mem[bank]; const int region_all = sx * sy * sz; const size_t total_doubles = (size_t)state_count * (size_t)region_all; @@ -7057,6 +7235,7 @@ static void copy_state_region_packed_batch_cuda(void *block_tag, ctx.resident_valid[bank] = true; ctx.resident_age[bank] = ++ctx.resident_clock; mark_resident_current_bank(ctx, bank); + mark_resident_host_subset_clean(ctx, bank, state_count, nullptr, false); update_state_ready(ctx); } } @@ -7067,10 +7246,15 @@ static void download_resident_state_count(void *block_tag, int *ex, double **sta const size_t bytes = all * sizeof(double); StepContext &ctx = ensure_step_ctx(block_tag, all); int bank = find_resident_bank_count(ctx, state_host_out, state_count); + bool bank_matches_output_key = (bank >= 0); if (bank < 0) { bank = (ctx.current_bank >= 0) ? ctx.current_bank : active_or_keyed_bank(ctx, nullptr, all, false); } mark_resident_current_bank(ctx, bank); + if (!bank_matches_output_key && + resident_key_usable_count(state_host_out, state_count)) { + assign_resident_key_count(ctx, bank, state_host_out, state_count); + } const bool profile = cuda_profile_enabled(); const double t0 = profile ? cuda_profile_now_ms() : 0.0; static int direct_download = -1; @@ -7117,6 +7301,72 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos download_resident_state_count(block_tag, ex, state_host_out, BSSN_STATE_COUNT); } +static void upload_resident_state_count(void *block_tag, int *ex, double **state_host_in, int state_count) +{ + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + StepContext &ctx = ensure_step_ctx(block_tag, all); + int bank = -1; + if (state_count == BSSN_ESCALAR_STATE_COUNT) { + bank = ensure_escalar_resident_bank(ctx, state_host_in, all, false); + bind_escalar_state_input_slots(ctx.d_resident[bank]); + upload_escalar_state_inputs(state_host_in, all); + } else if (state_count == BSSN_STATE_COUNT) { + bank = ensure_resident_bank(ctx, state_host_in, all, false); + bind_state_input_slots(ctx.d_resident[bank]); + upload_state_inputs(state_host_in, all); + } else { + return; + } + CUDA_CHECK(cudaDeviceSynchronize()); + ctx.resident_valid[bank] = true; + ctx.resident_age[bank] = ++ctx.resident_clock; + set_resident_host_clean(ctx, bank, true); + mark_resident_current_bank(ctx, bank); + update_state_ready(ctx); +} + +static void keep_only_resident_state_count(void *block_tag, + int *ex, + double **state_host_key, + int state_count) +{ + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) + return; + auto it = g_step_ctx.find(block_tag); + if (it == g_step_ctx.end()) return; + + StepContext &ctx = it->second; + const int keep_bank = find_resident_bank_count(ctx, state_host_key, state_count); + if (keep_bank < 0 || !ctx.resident_valid[keep_bank]) + return; + + auto keep_clean = ctx.resident_host_clean[keep_bank]; + + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + ctx.resident_valid[b] = false; + ctx.resident_host[b].fill(nullptr); + ctx.resident_host_clean[b].fill(0); + ctx.resident_age[b] = 0; + } + ctx.d_state_curr_mem = nullptr; + ctx.d_state_next_mem = nullptr; + ctx.d_state_curr.fill(nullptr); + ctx.d_state_next.fill(nullptr); + ctx.current_bank = -1; + ctx.resident_clock = 0; + ctx.matter_ready = false; + + for (int i = 0; i < state_count; ++i) { + ctx.resident_host[keep_bank][i] = state_host_key[i]; + ctx.resident_host_clean[keep_bank][i] = keep_clean[i] ? 1 : 0; + } + ctx.resident_valid[keep_bank] = true; + ctx.resident_age[keep_bank] = ++ctx.resident_clock; + mark_resident_current_bank(ctx, keep_bank); + (void)ex; + update_state_ready(ctx); +} + static bool download_resident_state_count_if_present(void *block_tag, int *ex, double **state_host_out, @@ -7183,8 +7433,27 @@ static void copy_state_subset(void *block_tag, const size_t bytes = all * sizeof(double); StepContext &ctx = ensure_step_ctx(block_tag, all); double **full_key = (subset_count == BSSN_RESIDENT_STATE_CAPACITY) ? state_host : nullptr; - const int bank = active_or_keyed_bank(ctx, full_key, all, - kind == cudaMemcpyHostToDevice); + int bank = -1; + if (state_host) { + if (full_key) { + bank = (subset_count == BSSN_ESCALAR_STATE_COUNT) + ? find_resident_bank_count(ctx, full_key, BSSN_ESCALAR_STATE_COUNT) + : find_resident_bank(ctx, full_key); + } else { + bank = find_resident_bank_subset(ctx, state_host, state_indices, subset_count); + } + if (kind == cudaMemcpyDeviceToHost && + (bank < 0 || !ctx.resident_valid[bank])) { + bank = -1; + } + } + if (bank < 0) { + bank = active_or_keyed_bank(ctx, full_key, all, + kind == cudaMemcpyHostToDevice, + subset_count); + } else { + mark_resident_current_bank(ctx, bank); + } double *base_mem = ctx.d_resident_mem[bank]; int active_state_indices[BSSN_RESIDENT_STATE_CAPACITY]; double *active_state_host[BSSN_RESIDENT_STATE_CAPACITY]; @@ -7834,6 +8103,52 @@ int bssn_escalar_cuda_download_resident_state(void *block_tag, return 0; } +extern "C" +int bssn_cuda_upload_resident_state_count(void *block_tag, + int *ex, + double **state_host_in, + int state_count) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT) + return 1; + upload_resident_state_count(block_tag, ex, state_host_in, state_count); + return 0; +} + +extern "C" +int bssn_escalar_cuda_upload_resident_state(void *block_tag, + int *ex, + double **state_host_in) +{ + return bssn_cuda_upload_resident_state_count(block_tag, ex, state_host_in, + BSSN_ESCALAR_STATE_COUNT); +} + +extern "C" +int bssn_cuda_keep_only_resident_state_count(void *block_tag, + int *ex, + double **state_host_key, + int state_count) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT) + return 1; + keep_only_resident_state_count(block_tag, ex, state_host_key, state_count); + return 0; +} + +extern "C" +int bssn_escalar_cuda_keep_only_resident_state(void *block_tag, + int *ex, + double **state_host_key) +{ + return bssn_cuda_keep_only_resident_state_count(block_tag, ex, state_host_key, + BSSN_ESCALAR_STATE_COUNT); +} + extern "C" int bssn_cuda_download_resident_state_count_if_present(void *block_tag, int *ex, @@ -8032,6 +8347,28 @@ int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + int state_index, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (!state_host_key || + (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT)) + return 1; + copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex, + i0, j0, k0, sx, sy, sz, + cudaMemcpyHostToDevice, + state_host_key, state_count); + return 0; +} + extern "C" int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag, int state_count, @@ -8115,7 +8452,8 @@ static void copy_state_device_batch(void *block_tag, StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const int bank = active_or_keyed_bank(ctx, state_host_key, all, - pack_not_unpack == 0 || state_host_key != nullptr); + pack_not_unpack == 0 || state_host_key != nullptr, + state_count); double *base_mem = ctx.d_resident_mem[bank]; const int region_all = sx * sy * sz; dim3 launch_grid((unsigned int)grid((size_t)region_all), @@ -8164,7 +8502,8 @@ static void copy_state_device_segments(void *block_tag, StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const int bank = active_or_keyed_bank(ctx, state_host_key, all, - pack_not_unpack == 0 || state_host_key != nullptr); + pack_not_unpack == 0 || state_host_key != nullptr, + state_count); double *base_mem = ctx.d_resident_mem[bank]; int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8); CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, @@ -8187,6 +8526,7 @@ static void copy_state_device_segments(void *block_tag, ctx.resident_valid[bank] = true; ctx.resident_age[bank] = ++ctx.resident_clock; mark_resident_current_bank(ctx, bank); + set_resident_host_clean(ctx, bank, false); update_state_ready(ctx); } } @@ -8214,7 +8554,8 @@ static void restrict_state_device_segments(void *block_tag, StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const int bank = active_or_keyed_bank(ctx, state_host_key, all, - state_host_key != nullptr); + state_host_key != nullptr, + state_count); int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8); CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, (size_t)segment_count * 8 * sizeof(int), @@ -8253,7 +8594,8 @@ static void prolong_state_device_segments(void *block_tag, StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const int bank = active_or_keyed_bank(ctx, state_host_key, all, - state_host_key != nullptr); + state_host_key != nullptr, + state_count); int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 11); CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, (size_t)segment_count * 11 * sizeof(int), @@ -8498,7 +8840,7 @@ int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_t if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; - const int bank = active_or_keyed_bank(ctx, state_host_key, all, true); + const int bank = active_or_keyed_bank(ctx, state_host_key, all, true, state_count); const int region_all = sx * sy * sz; upload_comm_state_soa(state_soa, state_count); dim3 launch_grid((unsigned int)grid((size_t)region_all), @@ -8555,7 +8897,7 @@ int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_ta if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; - const int bank = active_or_keyed_bank(ctx, state_host_key, all, true); + const int bank = active_or_keyed_bank(ctx, state_host_key, all, true, state_count); const int region_all = sx * sy * sz; upload_comm_state_soa(state_soa, state_count); dim3 launch_grid((unsigned int)grid((size_t)region_all), @@ -8650,7 +8992,8 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag, src1_bank = ensure_escalar_resident_bank(ctx, src1_host_key, all, true); src2_bank = ensure_escalar_resident_bank(ctx, src2_host_key, all, true, src1_bank); src3_bank = (source_count == 3) - ? ensure_escalar_resident_bank(ctx, src3_host_key, all, true, src1_bank) + ? ensure_escalar_resident_bank_avoiding(ctx, src3_host_key, all, true, + src1_bank, src2_bank, -1) : -1; dst_bank = reserve_escalar_resident_output_bank_avoiding(ctx, dst_host_key, all, src1_bank, src2_bank, src3_bank); @@ -8658,7 +9001,8 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag, src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true); src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank); src3_bank = (source_count == 3) - ? ensure_resident_bank(ctx, src3_host_key, all, true, src1_bank) + ? ensure_resident_bank_avoiding(ctx, src3_host_key, all, true, + src1_bank, src2_bank, -1) : -1; dst_bank = reserve_resident_output_bank_avoiding(ctx, dst_host_key, all, src1_bank, src2_bank, src3_bank); diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 38f945a..946137d 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -104,6 +104,24 @@ int bssn_escalar_cuda_download_resident_state(void *block_tag, int *ex, double **state_host_out); +int bssn_cuda_upload_resident_state_count(void *block_tag, + int *ex, + double **state_host_in, + int state_count); + +int bssn_escalar_cuda_upload_resident_state(void *block_tag, + int *ex, + double **state_host_in); + +int bssn_cuda_keep_only_resident_state_count(void *block_tag, + int *ex, + double **state_host_key, + int state_count); + +int bssn_escalar_cuda_keep_only_resident_state(void *block_tag, + int *ex, + double **state_host_key); + int bssn_cuda_download_resident_state_count_if_present(void *block_tag, int *ex, double **state_host_out, @@ -169,6 +187,15 @@ int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + int state_index, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag, int state_count, double *host_buffer,