diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index b87ce6d..b0fbb57 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -1,12 +1,203 @@ #include "Parallel.h" -#include "fmisc.h" -#include "prolongrestrict.h" -#include "misc.h" -#include "parameters.h" - -int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion -{ +#include "fmisc.h" +#include "prolongrestrict.h" +#include "misc.h" +#include "parameters.h" +#include +#include +#if USE_CUDA_BSSN +#include +#include "bssn_rhs_cuda.h" +#endif + +namespace { + +struct SyncProfileStats +{ + long long start_calls; + long long finish_calls; + double start_sec; + double finish_sec; + double direct_pack_sec; + double direct_unpack_sec; + double wait_sec; +}; + +SyncProfileStats &sync_profile_stats() +{ + static SyncProfileStats stats = {0, 0, 0.0, 0.0, 0.0, 0.0, 0.0}; + return stats; +} + +bool sync_profile_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_PROFILE_SYNC"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +int sync_profile_every() +{ + static int every = -1; + if (every < 0) + { + const char *env = getenv("AMSS_PROFILE_SYNC_EVERY"); + every = (env && atoi(env) > 0) ? atoi(env) : 100; + } + return every; +} + +void sync_profile_maybe_log() +{ + if (!sync_profile_enabled()) + return; + SyncProfileStats &stats = sync_profile_stats(); + if (stats.finish_calls <= 0 || stats.finish_calls % sync_profile_every() != 0) + return; + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + fprintf(stderr, + "[AMSS-SYNC][rank %d] start=%lld finish=%lld avg_start=%.6f s avg_finish=%.6f s avg_wait=%.6f s avg_cuda_pack=%.6f s avg_cuda_unpack=%.6f s\n", + rank, + stats.start_calls, + stats.finish_calls, + stats.start_calls ? stats.start_sec / (double)stats.start_calls : 0.0, + stats.finish_calls ? stats.finish_sec / (double)stats.finish_calls : 0.0, + stats.finish_calls ? stats.wait_sec / (double)stats.finish_calls : 0.0, + stats.finish_calls ? stats.direct_pack_sec / (double)stats.finish_calls : 0.0, + stats.finish_calls ? stats.direct_unpack_sec / (double)stats.finish_calls : 0.0); + fflush(stderr); +} + +bool cuda_sync_pinned_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_PINNED_SYNC"); +#if USE_CUDA_BSSN + enabled = (!env || atoi(env) != 0) ? 1 : 0; +#else + enabled = 0; +#endif + } + return enabled != 0; +} + +void free_comm_buffer(double *&ptr, unsigned char &is_pinned) +{ + if (!ptr) + return; +#if USE_CUDA_BSSN + if (is_pinned) + cudaFreeHost(ptr); + else + delete[] ptr; +#else + delete[] ptr; +#endif + ptr = 0; + is_pinned = 0; +} + +double *alloc_comm_buffer(int length, unsigned char &is_pinned) +{ + is_pinned = 0; + if (length <= 0) + return 0; +#if USE_CUDA_BSSN + if (cuda_sync_pinned_enabled()) + { + double *ptr = 0; + cudaError_t err = cudaMallocHost((void **)&ptr, (size_t)length * sizeof(double)); + if (err == cudaSuccess) + { + is_pinned = 1; + return ptr; + } + } +#endif + return new double[length]; +} + +void ensure_comm_buffer(double **buffers, unsigned char *pinned_flags, int *caps, int idx, int length) +{ + if (length <= caps[idx]) + return; + free_comm_buffer(buffers[idx], pinned_flags[idx]); + buffers[idx] = alloc_comm_buffer(length, pinned_flags[idx]); + if (!buffers[idx]) + { + fprintf(stderr, "Parallel: failed to allocate communication buffer (%d doubles)\n", length); + MPI_Abort(MPI_COMM_WORLD, 1); + } + caps[idx] = length; +} + +int cuda_seg_begin(const Parallel::gridseg *seg, Block *bg, int dir) +{ + const double dx = bg->getdX(dir); + return (int)floor((seg->llb[dir] - bg->bbox[dir]) / dx + 0.5); +} + +#if USE_CUDA_BSSN +bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type) +{ + if (type != 1 || !src || !dst || !src->Bg) + return false; + return bssn_cuda_has_resident_state(src->Bg) != 0; +} + +bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type) +{ + if (type != 1 || !dst || !dst->Bg) + return false; + return bssn_cuda_has_resident_state(dst->Bg) != 0; +} + +bool cuda_direct_pack_segment(double *buffer, + const Parallel::gridseg *src, + const Parallel::gridseg *dst, + int state_index) +{ + const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + const int i0 = cuda_seg_begin(dst, src->Bg, 0); + const int j0 = cuda_seg_begin(dst, src->Bg, 1); + const int k0 = cuda_seg_begin(dst, src->Bg, 2); + const bool ok = bssn_cuda_pack_state_region_to_host_buffer(src->Bg, state_index, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + if (sync_profile_enabled()) + sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; + return ok; +} + +bool cuda_direct_unpack_segment(double *buffer, + const Parallel::gridseg *dst, + int state_index) +{ + const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + const int i0 = cuda_seg_begin(dst, dst->Bg, 0); + const int j0 = cuda_seg_begin(dst, dst->Bg, 1); + const int k0 = cuda_seg_begin(dst, dst->Bg, 2); + const bool ok = bssn_cuda_unpack_state_region_from_host_buffer(dst->Bg, state_index, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + if (sync_profile_enabled()) + sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0; + return ok; +} +#endif + +} // namespace + +int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion +{ nx = Mymax(1, shape / min_width); nx = Mymin(cpusize, nx); @@ -3754,22 +3945,46 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || - (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank)) - { - varls = VarLists; - varld = VarListd; - while (varls && varld) - { - if (data) - { - if (dir == PACK) - switch (type) - { - // attention must be paied to the difference between src's llb,uub and dst's llb,uub - case 1: + while (src && dst) + { + if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || + (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank)) + { + varls = VarLists; + varld = VarListd; + int state_idx = 0; + while (varls && varld) + { + if (data) + { +#if USE_CUDA_BSSN + bool handled_by_cuda = false; + if (dir == PACK && cuda_can_direct_pack(src->data, dst->data, type)) + { + handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_idx); + if (!handled_by_cuda) + { + cout << "Parallel::data_packer: CUDA direct pack failed." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + else if (dir == UNPACK && cuda_can_direct_unpack(dst->data, type)) + { + handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_idx); + if (!handled_by_cuda) + { + cout << "Parallel::data_packer: CUDA direct unpack failed." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + if (!handled_by_cuda) + { +#endif + if (dir == PACK) + switch (type) + { + // attention must be paied to the difference between src's llb,uub and dst's llb,uub + case 1: f_copy(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], dst->data->llb, dst->data->uub); @@ -3784,18 +3999,22 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data->llb, dst->data->uub, dst->data->shape, data + size_out, dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); } - if (dir == UNPACK) // from target data to corresponding grid - f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn], - dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, - dst->data->llb, dst->data->uub); - } - size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2]; - varls = varls->next; - varld = varld->next; - } - } - dst = dst->next; - src = src->next; + if (dir == UNPACK) // from target data to corresponding grid + f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn], + dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, + dst->data->llb, dst->data->uub); +#if USE_CUDA_BSSN + } +#endif + } + size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2]; + varls = varls->next; + varld = varld->next; + ++state_idx; + } + } + dst = dst->next; + src = src->next; } return size_out; @@ -4316,13 +4535,14 @@ void Parallel::Sync_merged(MyList *PatL, MyList *VarList, int Symmet delete[] combined_dst; } // SyncCache constructor -Parallel::SyncCache::SyncCache() - : valid(false), cpusize(0), combined_src(0), combined_dst(0), - send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0), - send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0), - lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0) -{ -} +Parallel::SyncCache::SyncCache() + : valid(false), cpusize(0), combined_src(0), combined_dst(0), + send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0), + send_buf_caps(0), recv_buf_caps(0), send_buf_pinned(0), recv_buf_pinned(0), + reqs(0), stats(0), max_reqs(0), + lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0) +{ +} // SyncCache invalidate: free grid segment lists but keep buffers void Parallel::SyncCache::invalidate() { @@ -4346,34 +4566,51 @@ void Parallel::SyncCache::destroy() invalidate(); if (combined_src) delete[] combined_src; if (combined_dst) delete[] combined_dst; - if (send_lengths) delete[] send_lengths; - if (recv_lengths) delete[] recv_lengths; - if (send_buf_caps) delete[] send_buf_caps; - if (recv_buf_caps) delete[] recv_buf_caps; - for (int i = 0; i < cpusize; i++) - { - if (send_bufs && send_bufs[i]) delete[] send_bufs[i]; - if (recv_bufs && recv_bufs[i]) delete[] recv_bufs[i]; - } - if (send_bufs) delete[] send_bufs; - if (recv_bufs) delete[] recv_bufs; - if (reqs) delete[] reqs; - if (stats) delete[] stats; - if (tc_req_node) delete[] tc_req_node; - if (tc_req_is_recv) delete[] tc_req_is_recv; - if (tc_completed) delete[] tc_completed; - combined_src = combined_dst = 0; - send_lengths = recv_lengths = 0; - send_buf_caps = recv_buf_caps = 0; - send_bufs = recv_bufs = 0; - reqs = 0; stats = 0; + if (send_lengths) delete[] send_lengths; + if (recv_lengths) delete[] recv_lengths; + if (send_buf_caps) delete[] send_buf_caps; + if (recv_buf_caps) delete[] recv_buf_caps; + for (int i = 0; i < cpusize; i++) + { + if (send_bufs && send_bufs[i]) + { +#if USE_CUDA_BSSN + free_comm_buffer(send_bufs[i], send_buf_pinned[i]); +#else + delete[] send_bufs[i]; +#endif + } + if (recv_bufs && recv_bufs[i]) + { +#if USE_CUDA_BSSN + free_comm_buffer(recv_bufs[i], recv_buf_pinned[i]); +#else + delete[] recv_bufs[i]; +#endif + } + } + if (send_bufs) delete[] send_bufs; + if (recv_bufs) delete[] recv_bufs; + if (send_buf_pinned) delete[] send_buf_pinned; + if (recv_buf_pinned) delete[] recv_buf_pinned; + if (reqs) delete[] reqs; + if (stats) delete[] stats; + if (tc_req_node) delete[] tc_req_node; + if (tc_req_is_recv) delete[] tc_req_is_recv; + if (tc_completed) delete[] tc_completed; + combined_src = combined_dst = 0; + send_lengths = recv_lengths = 0; + send_buf_caps = recv_buf_caps = 0; + send_bufs = recv_bufs = 0; + send_buf_pinned = recv_buf_pinned = 0; + reqs = 0; stats = 0; tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0; cpusize = 0; max_reqs = 0; } // transfer_cached: reuse pre-allocated buffers from SyncCache -void Parallel::transfer_cached(MyList **src, MyList **dst, - MyList *VarList1, MyList *VarList2, - int Symmetry, SyncCache &cache) +void Parallel::transfer_cached(MyList **src, MyList **dst, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) { int myrank; MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); @@ -4393,16 +4630,11 @@ void Parallel::transfer_cached(MyList **src, MyList 0) - { - if (rlength > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[rlength]; - cache.recv_buf_caps[node] = rlength; - } - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + cache.recv_lengths[node] = rlength; + if (rlength > 0) + { + ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength); + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); req_node[req_no] = node; req_is_recv[req_no] = 1; req_no++; @@ -4412,17 +4644,12 @@ void Parallel::transfer_cached(MyList **src, MyList 0) - { - if (self_len > cache.recv_buf_caps[myrank]) - { - if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank]; - cache.recv_bufs[myrank] = new double[self_len]; - cache.recv_buf_caps[myrank] = self_len; - } - data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - } + cache.recv_lengths[myrank] = self_len; + if (self_len > 0) + { + ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, myrank, self_len); + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } // Pack and post sends. for (node = 0; node < cpusize; node++) @@ -4430,16 +4657,11 @@ void Parallel::transfer_cached(MyList **src, MyList 0) - { - if (slength > cache.send_buf_caps[node]) - { - if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; - cache.send_bufs[node] = new double[slength]; - cache.send_buf_caps[node] = slength; - } - data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + cache.send_lengths[node] = slength; + if (slength > 0) + { + ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength); + data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); req_node[req_no] = node; req_is_recv[req_no] = 0; @@ -4467,209 +4689,123 @@ void Parallel::transfer_cached(MyList **src, MyList 0) MPI_Waitall(req_no, cache.reqs, cache.stats); - - if (self_len > 0) - data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); -} -void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache) -{ - if (!cache.valid) - { - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - cache.cpusize = cpusize; - - // Allocate cache arrays if needed - if (!cache.combined_src) - { - cache.combined_src = new MyList *[cpusize]; - cache.combined_dst = new MyList *[cpusize]; - cache.send_lengths = new int[cpusize]; - cache.recv_lengths = new int[cpusize]; - cache.send_bufs = new double *[cpusize]; - cache.recv_bufs = new double *[cpusize]; - cache.send_buf_caps = new int[cpusize]; - cache.recv_buf_caps = new int[cpusize]; - for (int i = 0; i < cpusize; i++) - { - cache.send_bufs[i] = cache.recv_bufs[i] = 0; - cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; - } - cache.max_reqs = 2 * cpusize; - cache.reqs = new MPI_Request[cache.max_reqs]; - cache.stats = new MPI_Status[cache.max_reqs]; - cache.tc_req_node = new int[cache.max_reqs]; - cache.tc_req_is_recv = new int[cache.max_reqs]; - cache.tc_completed = new int[cache.max_reqs]; - } - - for (int node = 0; node < cpusize; node++) - { - cache.combined_src[node] = cache.combined_dst[node] = 0; - cache.send_lengths[node] = cache.recv_lengths[node] = 0; - } - - // Build intra-patch segments (same as Sync_merged Phase A) - MyList *Pp = PatL; - while (Pp) - { - Patch *Pat = Pp->data; - MyList *dst_ghost = build_ghost_gsl(Pat); - for (int node = 0; node < cpusize; node++) - { - MyList *src_owned = build_owned_gsl0(Pat, node); - MyList *tsrc = 0, *tdst = 0; - build_gstl(src_owned, dst_ghost, &tsrc, &tdst); - if (tsrc) - { - if (cache.combined_src[node]) - cache.combined_src[node]->catList(tsrc); - else - cache.combined_src[node] = tsrc; - } - if (tdst) - { - if (cache.combined_dst[node]) - cache.combined_dst[node]->catList(tdst); - else - cache.combined_dst[node] = tdst; - } - if (src_owned) src_owned->destroyList(); - } - if (dst_ghost) dst_ghost->destroyList(); - Pp = Pp->next; - } - - // Build inter-patch segments (same as Sync_merged Phase B) - MyList *dst_buffer = build_buffer_gsl(PatL); - for (int node = 0; node < cpusize; node++) - { - MyList *src_owned = build_owned_gsl(PatL, node, 5, Symmetry); - MyList *tsrc = 0, *tdst = 0; - build_gstl(src_owned, dst_buffer, &tsrc, &tdst); - if (tsrc) - { - if (cache.combined_src[node]) - cache.combined_src[node]->catList(tsrc); - else - cache.combined_src[node] = tsrc; - } - if (tdst) - { - if (cache.combined_dst[node]) - cache.combined_dst[node]->catList(tdst); - else - cache.combined_dst[node] = tdst; - } - if (src_owned) src_owned->destroyList(); - } - if (dst_buffer) dst_buffer->destroyList(); - - cache.valid = true; - } - - // Use cached lists with buffer-reusing transfer - transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache); -} + + if (self_len > 0) + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); +} +void Parallel::Sync_ensure_cache(MyList *PatL, int Symmetry, SyncCache &cache) +{ + if (cache.valid) + return; + + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + cache.cpusize = cpusize; + + if (!cache.combined_src) + { + cache.combined_src = new MyList *[cpusize]; + cache.combined_dst = new MyList *[cpusize]; + cache.send_lengths = new int[cpusize]; + cache.recv_lengths = new int[cpusize]; + cache.send_bufs = new double *[cpusize]; + cache.recv_bufs = new double *[cpusize]; + cache.send_buf_caps = new int[cpusize]; + cache.recv_buf_caps = new int[cpusize]; + cache.send_buf_pinned = new unsigned char[cpusize]; + cache.recv_buf_pinned = new unsigned char[cpusize]; + for (int i = 0; i < cpusize; i++) + { + cache.send_bufs[i] = cache.recv_bufs[i] = 0; + cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; + cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0; + } + cache.max_reqs = 2 * cpusize; + cache.reqs = new MPI_Request[cache.max_reqs]; + cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; + } + + for (int node = 0; node < cpusize; node++) + { + cache.combined_src[node] = cache.combined_dst[node] = 0; + cache.send_lengths[node] = cache.recv_lengths[node] = 0; + } + + MyList *Pp = PatL; + while (Pp) + { + Patch *Pat = Pp->data; + MyList *dst_ghost = build_ghost_gsl(Pat); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl0(Pat, node); + MyList *tsrc = 0, *tdst = 0; + build_gstl(src_owned, dst_ghost, &tsrc, &tdst); + if (tsrc) + { + if (cache.combined_src[node]) + cache.combined_src[node]->catList(tsrc); + else + cache.combined_src[node] = tsrc; + } + if (tdst) + { + if (cache.combined_dst[node]) + cache.combined_dst[node]->catList(tdst); + else + cache.combined_dst[node] = tdst; + } + if (src_owned) src_owned->destroyList(); + } + if (dst_ghost) dst_ghost->destroyList(); + Pp = Pp->next; + } + + MyList *dst_buffer = build_buffer_gsl(PatL); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl(PatL, node, 5, Symmetry); + MyList *tsrc = 0, *tdst = 0; + build_gstl(src_owned, dst_buffer, &tsrc, &tdst); + if (tsrc) + { + if (cache.combined_src[node]) + cache.combined_src[node]->catList(tsrc); + else + cache.combined_src[node] = tsrc; + } + if (tdst) + { + if (cache.combined_dst[node]) + cache.combined_dst[node]->catList(tdst); + else + cache.combined_dst[node] = tdst; + } + if (src_owned) src_owned->destroyList(); + } + if (dst_buffer) dst_buffer->destroyList(); + + cache.valid = true; +} +void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache) +{ + Sync_ensure_cache(PatL, Symmetry, cache); + + // Use cached lists with buffer-reusing transfer + transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache); +} // Sync_start: pack and post MPI_Isend/Irecv, return immediately -void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetry, - SyncCache &cache, AsyncSyncState &state) -{ - // Ensure cache is built - if (!cache.valid) - { - // Build cache (same logic as Sync_cached) - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - cache.cpusize = cpusize; - - if (!cache.combined_src) - { - cache.combined_src = new MyList *[cpusize]; - cache.combined_dst = new MyList *[cpusize]; - cache.send_lengths = new int[cpusize]; - cache.recv_lengths = new int[cpusize]; - cache.send_bufs = new double *[cpusize]; - cache.recv_bufs = new double *[cpusize]; - cache.send_buf_caps = new int[cpusize]; - cache.recv_buf_caps = new int[cpusize]; - for (int i = 0; i < cpusize; i++) - { - cache.send_bufs[i] = cache.recv_bufs[i] = 0; - cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; - } - cache.max_reqs = 2 * cpusize; - cache.reqs = new MPI_Request[cache.max_reqs]; - cache.stats = new MPI_Status[cache.max_reqs]; - cache.tc_req_node = new int[cache.max_reqs]; - cache.tc_req_is_recv = new int[cache.max_reqs]; - cache.tc_completed = new int[cache.max_reqs]; - } - - for (int node = 0; node < cpusize; node++) - { - cache.combined_src[node] = cache.combined_dst[node] = 0; - cache.send_lengths[node] = cache.recv_lengths[node] = 0; - } - - MyList *Pp = PatL; - while (Pp) - { - Patch *Pat = Pp->data; - MyList *dst_ghost = build_ghost_gsl(Pat); - for (int node = 0; node < cpusize; node++) - { - MyList *src_owned = build_owned_gsl0(Pat, node); - MyList *tsrc = 0, *tdst = 0; - build_gstl(src_owned, dst_ghost, &tsrc, &tdst); - if (tsrc) - { - if (cache.combined_src[node]) - cache.combined_src[node]->catList(tsrc); - else - cache.combined_src[node] = tsrc; - } - if (tdst) - { - if (cache.combined_dst[node]) - cache.combined_dst[node]->catList(tdst); - else - cache.combined_dst[node] = tdst; - } - if (src_owned) src_owned->destroyList(); - } - if (dst_ghost) dst_ghost->destroyList(); - Pp = Pp->next; - } - - MyList *dst_buffer = build_buffer_gsl(PatL); - for (int node = 0; node < cpusize; node++) - { - MyList *src_owned = build_owned_gsl(PatL, node, 5, Symmetry); - MyList *tsrc = 0, *tdst = 0; - build_gstl(src_owned, dst_buffer, &tsrc, &tdst); - if (tsrc) - { - if (cache.combined_src[node]) - cache.combined_src[node]->catList(tsrc); - else - cache.combined_src[node] = tsrc; - } - if (tdst) - { - if (cache.combined_dst[node]) - cache.combined_dst[node]->catList(tdst); - else - cache.combined_dst[node] = tdst; - } - if (src_owned) src_owned->destroyList(); - } - if (dst_buffer) dst_buffer->destroyList(); - cache.valid = true; - } - - // Now pack and post async MPI operations - int myrank; +void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetry, + SyncCache &cache, AsyncSyncState &state) +{ + const double t_start = sync_profile_enabled() ? MPI_Wtime() : 0.0; + Sync_ensure_cache(PatL, Symmetry, cache); + + // Now pack and post async MPI operations + int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); int cpusize = cache.cpusize; state.req_no = 0; @@ -4679,125 +4815,139 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr delete[] state.req_node; delete[] state.req_is_recv; state.req_node = new int[cache.max_reqs]; state.req_is_recv = new int[cache.max_reqs]; - - MyList **src = cache.combined_src; - MyList **dst = cache.combined_dst; - - for (int node = 0; node < cpusize; node++) - { - if (node == myrank) - { - int length; + + MyList **src = cache.combined_src; + MyList **dst = cache.combined_dst; + + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) + continue; + int rlength; + if (!cache.lengths_valid) { + rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry); + cache.recv_lengths[node] = rlength; + } else { + rlength = cache.recv_lengths[node]; + } + if (rlength > 0) + { + ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength); + state.req_node[state.req_no] = node; + state.req_is_recv[state.req_no] = 1; + state.pending_recv++; + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); + } + } + + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) + { + int length; if (!cache.lengths_valid) { length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); cache.recv_lengths[node] = length; } else { length = cache.recv_lengths[node]; - } - if (length > 0) - { - if (length > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[length]; - cache.recv_buf_caps[node] = length; - } - data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); - } - } - else - { + } + if (length > 0) + { + ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, length); + data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + } + } + else + { int slength; if (!cache.lengths_valid) { slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); cache.send_lengths[node] = slength; } else { slength = cache.send_lengths[node]; - } - if (slength > 0) - { - if (slength > cache.send_buf_caps[node]) - { - if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; - cache.send_bufs[node] = new double[slength]; - cache.send_buf_caps[node] = slength; - } - data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); - state.req_node[state.req_no] = node; - state.req_is_recv[state.req_no] = 0; - MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); - } - int rlength; - if (!cache.lengths_valid) { - rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry); - cache.recv_lengths[node] = rlength; - } else { - rlength = cache.recv_lengths[node]; - } - if (rlength > 0) - { - if (rlength > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[rlength]; - cache.recv_buf_caps[node] = rlength; - } - state.req_node[state.req_no] = node; - state.req_is_recv[state.req_no] = 1; - state.pending_recv++; - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); - } - } - } - cache.lengths_valid = true; -} -// Sync_finish: progressive unpack as receives complete, then wait for sends -void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, - MyList *VarList, int Symmetry) -{ + } + if (slength > 0) + { + ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength); + data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + state.req_node[state.req_no] = node; + state.req_is_recv[state.req_no] = 0; + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); + } + } + } + cache.lengths_valid = true; + if (sync_profile_enabled()) + { + SyncProfileStats &stats = sync_profile_stats(); + stats.start_calls++; + stats.start_sec += MPI_Wtime() - t_start; + } +} +// Sync_finish: progressive unpack as receives complete, then wait for sends +void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, + MyList *VarList, int Symmetry) +{ if (!state.active) return; - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - MyList **src = cache.combined_src; - MyList **dst = cache.combined_dst; - - // Unpack local data first (no MPI needed) - if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0) - data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry); + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MyList **src = cache.combined_src; + MyList **dst = cache.combined_dst; + const double t_finish = sync_profile_enabled() ? MPI_Wtime() : 0.0; + double wait_sec = 0.0; + + // Unpack local data first (no MPI needed) + if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0) + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry); // Progressive unpack of remote receives - if (state.pending_recv > 0 && state.req_no > 0) - { - int pending = state.pending_recv; - int *completed = new int[cache.max_reqs]; - while (pending > 0) - { - int outcount = 0; - MPI_Waitsome(state.req_no, cache.reqs, &outcount, completed, cache.stats); - if (outcount == MPI_UNDEFINED) break; - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && state.req_is_recv[idx]) - { - int recv_node = state.req_node[idx]; - data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry); - pending--; - } - } - } - delete[] completed; - } - - // Wait for remaining sends - if (state.req_no > 0) MPI_Waitall(state.req_no, cache.reqs, cache.stats); - - delete[] state.req_node; state.req_node = 0; - delete[] state.req_is_recv; state.req_is_recv = 0; - state.active = false; -} + if (state.pending_recv > 0 && state.req_no > 0) + { + int pending = state.pending_recv; + while (pending > 0) + { + int outcount = 0; + const double t_wait = sync_profile_enabled() ? MPI_Wtime() : 0.0; + MPI_Waitsome(state.req_no, cache.reqs, &outcount, cache.tc_completed, cache.stats); + if (sync_profile_enabled()) + wait_sec += MPI_Wtime() - t_wait; + if (outcount == MPI_UNDEFINED) break; + for (int i = 0; i < outcount; i++) + { + int idx = cache.tc_completed[i]; + if (idx >= 0 && state.req_is_recv[idx]) + { + int recv_node = state.req_node[idx]; + data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry); + pending--; + } + } + } + } + + // Wait for remaining sends + if (state.req_no > 0) + { + const double t_wait = sync_profile_enabled() ? MPI_Wtime() : 0.0; + MPI_Waitall(state.req_no, cache.reqs, cache.stats); + if (sync_profile_enabled()) + wait_sec += MPI_Wtime() - t_wait; + } + + delete[] state.req_node; state.req_node = 0; + delete[] state.req_is_recv; state.req_is_recv = 0; + state.active = false; + if (sync_profile_enabled()) + { + SyncProfileStats &stats = sync_profile_stats(); + stats.finish_calls++; + stats.finish_sec += MPI_Wtime() - t_finish; + stats.wait_sec += wait_sec; + sync_profile_maybe_log(); + } +} // collect buffer grid segments or blocks for the periodic boundary condition of given patch // --------------------------------------------------- // |con | |con | @@ -5850,15 +6000,18 @@ void Parallel::Restrict_cached(MyList *PatcL, MyList *PatfL, cache.combined_dst = new MyList *[cpusize]; cache.send_lengths = new int[cpusize]; cache.recv_lengths = new int[cpusize]; - cache.send_bufs = new double *[cpusize]; - cache.recv_bufs = new double *[cpusize]; - cache.send_buf_caps = new int[cpusize]; - cache.recv_buf_caps = new int[cpusize]; - for (int i = 0; i < cpusize; i++) - { - cache.send_bufs[i] = cache.recv_bufs[i] = 0; - cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; - } + cache.send_bufs = new double *[cpusize]; + cache.recv_bufs = new double *[cpusize]; + cache.send_buf_caps = new int[cpusize]; + cache.recv_buf_caps = new int[cpusize]; + cache.send_buf_pinned = new unsigned char[cpusize]; + cache.recv_buf_pinned = new unsigned char[cpusize]; + for (int i = 0; i < cpusize; i++) + { + cache.send_bufs[i] = cache.recv_bufs[i] = 0; + cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; + cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0; + } cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; @@ -5899,15 +6052,18 @@ void Parallel::OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL, cache.combined_dst = new MyList *[cpusize]; cache.send_lengths = new int[cpusize]; cache.recv_lengths = new int[cpusize]; - cache.send_bufs = new double *[cpusize]; - cache.recv_bufs = new double *[cpusize]; - cache.send_buf_caps = new int[cpusize]; - cache.recv_buf_caps = new int[cpusize]; - for (int i = 0; i < cpusize; i++) - { - cache.send_bufs[i] = cache.recv_bufs[i] = 0; - cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; - } + cache.send_bufs = new double *[cpusize]; + cache.recv_bufs = new double *[cpusize]; + cache.send_buf_caps = new int[cpusize]; + cache.recv_buf_caps = new int[cpusize]; + cache.send_buf_pinned = new unsigned char[cpusize]; + cache.recv_buf_pinned = new unsigned char[cpusize]; + for (int i = 0; i < cpusize; i++) + { + cache.send_bufs[i] = cache.recv_bufs[i] = 0; + cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; + cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0; + } cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; @@ -5948,15 +6104,18 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, cache.combined_dst = new MyList *[cpusize]; cache.send_lengths = new int[cpusize]; cache.recv_lengths = new int[cpusize]; - cache.send_bufs = new double *[cpusize]; - cache.recv_bufs = new double *[cpusize]; - cache.send_buf_caps = new int[cpusize]; - cache.recv_buf_caps = new int[cpusize]; - for (int i = 0; i < cpusize; i++) - { - cache.send_bufs[i] = cache.recv_bufs[i] = 0; - cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; - } + cache.send_bufs = new double *[cpusize]; + cache.recv_bufs = new double *[cpusize]; + cache.send_buf_caps = new int[cpusize]; + cache.recv_buf_caps = new int[cpusize]; + cache.send_buf_pinned = new unsigned char[cpusize]; + cache.recv_buf_pinned = new unsigned char[cpusize]; + for (int i = 0; i < cpusize; i++) + { + cache.send_bufs[i] = cache.recv_bufs[i] = 0; + cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; + cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0; + } cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; @@ -5995,16 +6154,11 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, if (node == myrank) continue; int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[node] = rlength; - if (rlength > 0) - { - if (rlength > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[rlength]; - cache.recv_buf_caps[node] = rlength; - } - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + cache.recv_lengths[node] = rlength; + if (rlength > 0) + { + ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength); + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); req_node[req_no] = node; req_is_recv[req_no] = 1; req_no++; @@ -6014,17 +6168,12 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, // Local transfer on this rank. int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[myrank] = self_len; - if (self_len > 0) - { - if (self_len > cache.recv_buf_caps[myrank]) - { - if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank]; - cache.recv_bufs[myrank] = new double[self_len]; - cache.recv_buf_caps[myrank] = self_len; - } - data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - } + cache.recv_lengths[myrank] = self_len; + if (self_len > 0) + { + ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, myrank, self_len); + data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } // Pack and post sends. for (int node = 0; node < cpusize; node++) @@ -6032,16 +6181,11 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, if (node == myrank) continue; int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - cache.send_lengths[node] = slength; - if (slength > 0) - { - if (slength > cache.send_buf_caps[node]) - { - if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; - cache.send_bufs[node] = new double[slength]; - cache.send_buf_caps[node] = slength; - } - data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + cache.send_lengths[node] = slength; + if (slength > 0) + { + ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength); + data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); req_node[req_no] = node; req_is_recv[req_no] = 0; diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h index 0ab975c..ef8569a 100644 --- a/AMSS_NCKU_source/Parallel.h +++ b/AMSS_NCKU_source/Parallel.h @@ -100,12 +100,14 @@ namespace Parallel MyList **combined_dst; int *send_lengths; int *recv_lengths; - double **send_bufs; - double **recv_bufs; - int *send_buf_caps; - int *recv_buf_caps; - MPI_Request *reqs; - MPI_Status *stats; + double **send_bufs; + double **recv_bufs; + int *send_buf_caps; + int *recv_buf_caps; + unsigned char *send_buf_pinned; + unsigned char *recv_buf_pinned; + MPI_Request *reqs; + MPI_Status *stats; int max_reqs; bool lengths_valid; int *tc_req_node; @@ -116,10 +118,11 @@ namespace Parallel void destroy(); }; - void Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache); - void transfer_cached(MyList **src, MyList **dst, - MyList *VarList1, MyList *VarList2, - int Symmetry, SyncCache &cache); + void Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache); + void Sync_ensure_cache(MyList *PatL, int Symmetry, SyncCache &cache); + void transfer_cached(MyList **src, MyList **dst, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache); struct AsyncSyncState { int req_no; diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 97b3e61..a69786d 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -76,6 +76,48 @@ bool fill_bssn_cuda_views(Block *cg, MyList *vars, return idx == BSSN_CUDA_STATE_COUNT && vars == 0; } +bool bssn_cuda_use_resident_sync(int lev) +{ +#ifdef WithShell + (void)lev; + return false; +#else + return lev == 0; +#endif +} + +void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int myrank) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { + double *state_out[BSSN_CUDA_STATE_COUNT]; + if (!fill_bssn_cuda_views(cg, vars, state_out)) + { + cout << "CUDA BSSN state list mismatch on resident state download" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (bssn_cuda_download_resident_state(cg, cg->shape, state_out)) + { + cout << "CUDA resident state download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + bssn_cuda_release_step_ctx(cg); + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} + } // namespace #endif @@ -3058,13 +3100,18 @@ void bssn_class::RecursiveStep(int lev, int num) // in all 2^(lev+1)-1 steps #if (PSTR == 0) #if 1 -void bssn_class::Step(int lev, int YN) -{ - setpbh(BH_num, Porg0, Mass, BH_num_input); - - double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); - -// new code 2013-2-15, zjcao +void bssn_class::Step(int lev, int YN) +{ + setpbh(BH_num, Porg0, Mass, BH_num_input); + + double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); +#if USE_CUDA_BSSN + const bool use_cuda_resident_sync = bssn_cuda_use_resident_sync(lev); +#else + const bool use_cuda_resident_sync = false; +#endif + +// new code 2013-2-15, zjcao #if (MAPBH == 1) // for black hole position if (BH_num > 0 && lev == GH->levels - 1) @@ -3128,15 +3175,17 @@ void bssn_class::Step(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { -#if (AGM == 0) - f_enforce_ga(cg->shape, - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], - cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], - cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], - cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); -#endif - +#if (AGM == 0) + if (!use_cuda_resident_sync) + f_enforce_ga(cg->shape, + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], + cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], + cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); +#endif + bool used_gpu_substep = false; + bool used_gpu_resident_state = false; #if USE_CUDA_BSSN { double *state_in[BSSN_CUDA_STATE_COUNT]; @@ -3154,6 +3203,11 @@ void bssn_class::Step(int lev, int YN) MPI_Abort(MPI_COMM_WORLD, 1); } int apply_bam_bc = 0; + int keep_resident_state = use_cuda_resident_sync ? 1 : 0; + int apply_enforce_ga = 0; +#if (AGM == 0) + apply_enforce_ga = 1; +#endif #if (SommerType == 0) #ifndef WithShell apply_bam_bc = (lev == 0) ? 1 : 0; @@ -3164,7 +3218,8 @@ void bssn_class::Step(int lev, int YN) state_in, state_out, matter, propspeed, soa_flat, Pp->data->bbox, dT_lev, TRK4, iter_count, apply_bam_bc, - Symmetry, lev, ndeps, pre)) + Symmetry, lev, ndeps, pre, + keep_resident_state, apply_enforce_ga, chitiny)) { cout << "CUDA predictor substep failed in domain: (" << cg->bbox[0] << ":" << cg->bbox[3] << "," @@ -3173,6 +3228,7 @@ void bssn_class::Step(int lev, int YN) ERROR = 1; } used_gpu_substep = true; + used_gpu_resident_state = (keep_resident_state != 0); } #endif if (!used_gpu_substep) @@ -3277,8 +3333,9 @@ void bssn_class::Step(int lev, int YN) varlrhs = varlrhs->next; } } - f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny); - } + if (!used_gpu_resident_state) + f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny); + } if (BP == Pp->data->ble) break; BP = BP->next; @@ -3436,9 +3493,9 @@ void bssn_class::Step(int lev, int YN) MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); } #endif - - Parallel::AsyncSyncState async_pre; - Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); + + Parallel::AsyncSyncState async_pre; + Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); #ifdef WithShell if (lev == 0) @@ -3455,9 +3512,9 @@ void bssn_class::Step(int lev, int YN) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! " << endl; } - } -#endif - Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); + } +#endif + Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); #ifdef WithShell // Complete non-blocking error reduction and check @@ -3530,22 +3587,24 @@ void bssn_class::Step(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { -#if (AGM == 0) - f_enforce_ga(cg->shape, - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], - cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], - cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); -#elif (AGM == 1) - if (iter_count == 3) - f_enforce_ga(cg->shape, - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], - cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], - cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); -#endif - +#if (AGM == 0) + if (!use_cuda_resident_sync) + f_enforce_ga(cg->shape, + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); +#elif (AGM == 1) + if (iter_count == 3 && !use_cuda_resident_sync) + f_enforce_ga(cg->shape, + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); +#endif + bool used_gpu_substep = false; + bool used_gpu_resident_state = false; #if USE_CUDA_BSSN { double *state_in[BSSN_CUDA_STATE_COUNT]; @@ -3563,6 +3622,13 @@ void bssn_class::Step(int lev, int YN) MPI_Abort(MPI_COMM_WORLD, 1); } int apply_bam_bc = 0; + int keep_resident_state = use_cuda_resident_sync ? 1 : 0; + int apply_enforce_ga = 0; +#if (AGM == 0) + apply_enforce_ga = 1; +#elif (AGM == 1) + apply_enforce_ga = (iter_count == 3) ? 1 : 0; +#endif #if (SommerType == 0) #ifndef WithShell apply_bam_bc = (lev == 0) ? 1 : 0; @@ -3573,7 +3639,8 @@ void bssn_class::Step(int lev, int YN) state_in, state_out, matter, propspeed, soa_flat, Pp->data->bbox, dT_lev, TRK4, iter_count, apply_bam_bc, - Symmetry, lev, ndeps, cor)) + Symmetry, lev, ndeps, cor, + keep_resident_state, apply_enforce_ga, chitiny)) { cout << "CUDA corrector substep failed in domain: (" << cg->bbox[0] << ":" << cg->bbox[3] << "," @@ -3582,6 +3649,7 @@ void bssn_class::Step(int lev, int YN) ERROR = 1; } used_gpu_substep = true; + used_gpu_resident_state = (keep_resident_state != 0); } #endif if (!used_gpu_substep) @@ -3686,8 +3754,9 @@ void bssn_class::Step(int lev, int YN) varlrhs = varlrhs->next; } } - f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny); - } + if (!used_gpu_resident_state) + f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny); + } if (BP == Pp->data->ble) break; BP = BP->next; @@ -3842,9 +3911,9 @@ void bssn_class::Step(int lev, int YN) MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); } #endif - - Parallel::AsyncSyncState async_cor; - Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); + + Parallel::AsyncSyncState async_cor; + Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); #ifdef WithShell if (lev == 0) @@ -3862,8 +3931,8 @@ void bssn_class::Step(int lev, int YN) << " seconds! " << endl; } } -#endif - Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); +#endif + Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); #ifdef WithShell // Complete non-blocking error reduction and check @@ -3968,10 +4037,14 @@ void bssn_class::Step(int lev, int YN) } #endif } - } -#if (RPS == 0) - // mesh refinement boundary part - RestrictProlong(lev, YN, BB); + } +#if USE_CUDA_BSSN + if (use_cuda_resident_sync) + bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank); +#endif +#if (RPS == 0) + // mesh refinement boundary part + RestrictProlong(lev, YN, BB); #ifdef WithShell if (lev == 0) diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 04f2dd0..d0de9ab 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -62,6 +63,72 @@ static void init_gpu_dispatch() { } g_dispatch.inited = true; } + +struct CudaProfileStats { + long long calls; + double total_ms; + double state_ms; + double matter_ms; + double rhs_ms; + double bc_ms; + double finalize_ms; + double output_ms; +}; + +static CudaProfileStats &cuda_profile_stats() { + static CudaProfileStats stats = {0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + return stats; +} + +static bool cuda_profile_enabled() { + static int enabled = -1; + if (enabled < 0) { + const char *env = getenv("AMSS_PROFILE_CUDA"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +static int cuda_profile_every() { + static int every = -1; + if (every < 0) { + const char *env = getenv("AMSS_PROFILE_CUDA_EVERY"); + every = (env && atoi(env) > 0) ? atoi(env) : 100; + } + return every; +} + +static double cuda_profile_now_ms() { + using clock = std::chrono::steady_clock; + return std::chrono::duration( + clock::now().time_since_epoch()).count(); +} + +static void cuda_profile_sync() { + cudaError_t err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error %s:%d: %s\n", + __FILE__, __LINE__, cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +static void cuda_profile_maybe_log() { + if (!cuda_profile_enabled()) return; + CudaProfileStats &stats = cuda_profile_stats(); + if (stats.calls <= 0 || stats.calls % cuda_profile_every() != 0) return; + fprintf(stderr, + "[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms\n", + g_dispatch.my_rank, g_dispatch.my_device, stats.calls, + stats.total_ms / (double)stats.calls, + stats.state_ms / (double)stats.calls, + stats.matter_ms / (double)stats.calls, + stats.rhs_ms / (double)stats.calls, + stats.bc_ms / (double)stats.calls, + stats.finalize_ms / (double)stats.calls, + stats.output_ms / (double)stats.calls); + fflush(stderr); +} /* ------------------------------------------------------------------ */ /* Error checking */ @@ -248,12 +315,17 @@ static const int k_matter_slots[BSSN_MATTER_COUNT] = { struct StepContext { double *d_state0_mem; double *d_accum_mem; + double *d_state_curr_mem; + double *d_state_next_mem; double *d_matter_mem; std::array d_state0; std::array d_accum; + std::array d_state_curr; + std::array d_state_next; std::array d_matter; size_t cap_all; bool matter_ready; + bool state_ready; }; static std::unordered_map g_step_ctx; @@ -321,19 +393,32 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) cudaFree(ctx.d_accum_mem); ctx.d_accum_mem = nullptr; } + if (ctx.d_state_curr_mem) { + cudaFree(ctx.d_state_curr_mem); + ctx.d_state_curr_mem = nullptr; + } + if (ctx.d_state_next_mem) { + cudaFree(ctx.d_state_next_mem); + ctx.d_state_next_mem = nullptr; + } if (ctx.d_matter_mem) { cudaFree(ctx.d_matter_mem); ctx.d_matter_mem = nullptr; } CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double))); CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&ctx.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&ctx.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double))); CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double))); ctx.cap_all = all; ctx.matter_ready = false; + ctx.state_ready = false; } for (int i = 0; i < BSSN_STATE_COUNT; ++i) { ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all; ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all; + ctx.d_state_curr[i] = ctx.d_state_curr_mem + (size_t)i * all; + ctx.d_state_next[i] = ctx.d_state_next_mem + (size_t)i * all; } for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all; @@ -347,6 +432,8 @@ static void release_step_ctx(void *block_tag) if (it == g_step_ctx.end()) return; if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem); if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem); + if (it->second.d_state_curr_mem) cudaFree(it->second.d_state_curr_mem); + if (it->second.d_state_next_mem) cudaFree(it->second.d_state_next_mem); if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem); g_step_ctx.erase(it); } @@ -1050,6 +1137,76 @@ __global__ void kern_rk4_finalize(const double * __restrict__ f0, } } +__global__ void kern_enforce_ga_cuda(double * __restrict__ dxx, + double * __restrict__ gxy, + double * __restrict__ gxz, + double * __restrict__ dyy, + double * __restrict__ gyz, + double * __restrict__ dzz, + double * __restrict__ Axx, + double * __restrict__ Axy, + double * __restrict__ Axz, + double * __restrict__ Ayy, + double * __restrict__ Ayz, + double * __restrict__ Azz) +{ + constexpr double F1O3 = 1.0 / 3.0; + constexpr double ONE = 1.0; + constexpr double TWO = 2.0; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < d_gp.all; + i += blockDim.x * gridDim.x) + { + double lgxx = dxx[i] + ONE; + double lgyy = dyy[i] + ONE; + double lgzz = dzz[i] + ONE; + double lgxy = gxy[i]; + double lgxz = gxz[i]; + double lgyz = gyz[i]; + + double lscale = lgxx * lgyy * lgzz + + lgxy * lgyz * lgxz + + lgxz * lgxy * lgyz + - lgxz * lgyy * lgxz + - lgxy * lgxy * lgzz + - lgxx * lgyz * lgyz; + + lscale = ONE / cbrt(lscale); + + lgxx *= lscale; + lgxy *= lscale; + lgxz *= lscale; + lgyy *= lscale; + lgyz *= lscale; + lgzz *= lscale; + + dxx[i] = lgxx - ONE; + gxy[i] = lgxy; + gxz[i] = lgxz; + dyy[i] = lgyy - ONE; + gyz[i] = lgyz; + dzz[i] = lgzz - ONE; + + const double lgupxx = (lgyy * lgzz - lgyz * lgyz); + const double lgupxy = - (lgxy * lgzz - lgyz * lgxz); + const double lgupxz = (lgxy * lgyz - lgyy * lgxz); + const double lgupyy = (lgxx * lgzz - lgxz * lgxz); + const double lgupyz = - (lgxx * lgyz - lgxy * lgxz); + const double lgupzz = (lgxx * lgyy - lgxy * lgxy); + + const double ltrA = lgupxx * Axx[i] + lgupyy * Ayy[i] + lgupzz * Azz[i] + + TWO * (lgupxy * Axy[i] + lgupxz * Axz[i] + lgupyz * Ayz[i]); + + Axx[i] -= F1O3 * lgxx * ltrA; + Axy[i] -= F1O3 * lgxy * ltrA; + Axz[i] -= F1O3 * lgxz * ltrA; + Ayy[i] -= F1O3 * lgyy * ltrA; + Ayz[i] -= F1O3 * lgyz * ltrA; + Azz[i] -= F1O3 * lgzz * ltrA; + } +} + __global__ void kern_lowerboundset_cuda(double * __restrict__ chi, double tinny) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -2429,6 +2586,20 @@ static void bind_matter_slots(const StepContext &ctx) } } +static void bind_state_input_slots(const std::array &state) +{ + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + g_buf.slot[k_state_input_slots[i]] = state[i]; + } +} + +static void bind_state_output_slots(const std::array &state) +{ + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + g_buf.slot[k_state_rhs_slots[i]] = state[i]; + } +} + static void launch_rhs_pipeline(int all, double eps, int co) { const double SYM = 1.0; @@ -2763,6 +2934,87 @@ static void download_state_outputs(double **state_host_out, size_t all) } } +static void copy_state_region_cuda(void *block_tag, + int state_index, + double *host_state, + const int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz, + cudaMemcpyKind kind) +{ + if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return; + if (sx <= 0 || sy <= 0 || sz <= 0) return; + + const size_t pitch = (size_t)ex[0] * sizeof(double); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + + cudaMemcpy3DParms p = {}; + p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz); + p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0); + p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0); + + if (kind == cudaMemcpyDeviceToHost) { + p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]); + p.dstPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]); + } else { + p.srcPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]); + p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]); + } + CUDA_CHECK(cudaMemcpy3D(&p)); +} + +static void copy_state_region_packed_cuda(void *block_tag, + int state_index, + double *host_buffer, + const int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz, + cudaMemcpyKind kind) +{ + if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return; + if (sx <= 0 || sy <= 0 || sz <= 0) return; + + const size_t src_pitch = (size_t)ex[0] * sizeof(double); + const size_t dst_pitch = (size_t)sx * sizeof(double); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + + cudaMemcpy3DParms p = {}; + p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz); + + if (kind == cudaMemcpyDeviceToHost) { + p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]); + p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0); + p.dstPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy); + p.dstPos = make_cudaPos(0, 0, 0); + } else { + p.srcPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy); + p.srcPos = make_cudaPos(0, 0, 0); + p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]); + p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0); + } + + CUDA_CHECK(cudaMemcpy3D(&p)); +} + +static void download_resident_state(void *block_tag, int *ex, double **state_host_out) +{ + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const size_t bytes = all * sizeof(double); + StepContext &ctx = ensure_step_ctx(block_tag, all); + CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem, + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyDeviceToHost)); + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); + } +} + +static bool has_resident_state(void *block_tag) +{ + auto it = g_step_ctx.find(block_tag); + return it != g_step_ctx.end() && it->second.state_ready; +} + /* ================================================================== */ /* Main host function — drop-in replacement for bssn_rhs_c.C */ /* ================================================================== */ @@ -3266,22 +3518,53 @@ int bssn_cuda_rk4_substep(void *block_tag, int &Symmetry, int &Lev, double &eps, - int &co) + int &co, + int &keep_resident_state, + int &apply_enforce_ga, + double &chitiny) { (void)T; - (void)Lev; + (void)state_host_out; if (RK4 < 0 || RK4 > 3) return 1; init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + const bool profile = cuda_profile_enabled(); + const double t_total0 = profile ? cuda_profile_now_ms() : 0.0; + double state_ms = 0.0; + double matter_ms = 0.0; + double rhs_ms = 0.0; + double bc_ms = 0.0; + double finalize_ms = 0.0; + double output_ms = 0.0; const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t bytes = all * sizeof(double); setup_grid_params(ex, X, Y, Z, Symmetry, eps, co); StepContext &ctx = ensure_step_ctx(block_tag, all); - upload_state_inputs(state_host_in, all); + const bool use_resident_state = (keep_resident_state != 0); + if (use_resident_state) { + bind_state_input_slots(ctx.d_state_curr); + bind_state_output_slots(ctx.d_state_next); + } + double t0 = profile ? cuda_profile_now_ms() : 0.0; + if (!use_resident_state || RK4 == 0 || !ctx.state_ready) { + upload_state_inputs(state_host_in, all); + } + if (apply_enforce_ga) { + kern_enforce_ga_cuda<<>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz], + g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz], + g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz], + g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]); + } + if (profile) { + cuda_profile_sync(); + state_ms += cuda_profile_now_ms() - t0; + } + + t0 = profile ? cuda_profile_now_ms() : 0.0; if (RK4 == 0) { upload_matter_cache(ctx, matter_host, all); CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi], @@ -3291,9 +3574,19 @@ int bssn_cuda_rk4_substep(void *block_tag, upload_matter_cache(ctx, matter_host, all); } bind_matter_slots(ctx); + if (profile) { + cuda_profile_sync(); + matter_ms += cuda_profile_now_ms() - t0; + } + t0 = profile ? cuda_profile_now_ms() : 0.0; launch_rhs_pipeline((int)all, eps, co); + if (profile) { + cuda_profile_sync(); + rhs_ms += cuda_profile_now_ms() - t0; + } + t0 = profile ? cuda_profile_now_ms() : 0.0; if (apply_bam_bc) { for (int i = 0; i < BSSN_STATE_COUNT; ++i) { gpu_sommerfeld_routbam(g_buf.slot[k_state_input_slots[i]], @@ -3305,7 +3598,12 @@ int bssn_cuda_rk4_substep(void *block_tag, X, Y, Z, bbox, Symmetry); } } + if (profile) { + cuda_profile_sync(); + bc_ms += cuda_profile_now_ms() - t0; + } + t0 = profile ? cuda_profile_now_ms() : 0.0; for (int i = 0; i < BSSN_STATE_COUNT; ++i) { kern_rk4_finalize<<>>(ctx.d_state0[i], g_buf.slot[k_state_rhs_slots[i]], @@ -3314,13 +3612,119 @@ int bssn_cuda_rk4_substep(void *block_tag, RK4); } - download_state_outputs(state_host_out, all); - if (RK4 == 3) { + kern_lowerboundset_cuda<<>>(g_buf.slot[S_chi_rhs], chitiny); + if (profile) { + cuda_profile_sync(); + finalize_ms += cuda_profile_now_ms() - t0; + } + + t0 = profile ? cuda_profile_now_ms() : 0.0; + if (use_resident_state) { + std::swap(ctx.d_state_curr_mem, ctx.d_state_next_mem); + ctx.d_state_curr.swap(ctx.d_state_next); + ctx.state_ready = true; + } else { + download_state_outputs(state_host_out, all); + } + if (RK4 == 3 && !use_resident_state) { release_step_ctx(block_tag); } + if (profile) { + cuda_profile_sync(); + output_ms += cuda_profile_now_ms() - t0; + CudaProfileStats &stats = cuda_profile_stats(); + stats.calls++; + stats.total_ms += cuda_profile_now_ms() - t_total0; + stats.state_ms += state_ms; + stats.matter_ms += matter_ms; + stats.rhs_ms += rhs_ms; + stats.bc_ms += bc_ms; + stats.finalize_ms += finalize_ms; + stats.output_ms += output_ms; + cuda_profile_maybe_log(); + } return 0; } +extern "C" +int bssn_cuda_copy_state_region_to_host(void *block_tag, + int state_index, + double *host_state, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_region_cuda(block_tag, state_index, host_state, ex, + i0, j0, k0, sx, sy, sz, cudaMemcpyDeviceToHost); + return 0; +} + +extern "C" +int bssn_cuda_copy_state_region_from_host(void *block_tag, + int state_index, + double *host_state, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_region_cuda(block_tag, state_index, host_state, ex, + i0, j0, k0, sx, sy, sz, cudaMemcpyHostToDevice); + return 0; +} + +extern "C" +int bssn_cuda_download_resident_state(void *block_tag, + int *ex, + double **state_host_out) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + download_resident_state(block_tag, ex, state_host_out); + return 0; +} + +extern "C" +int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag, + int state_index, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex, + i0, j0, k0, sx, sy, sz, cudaMemcpyDeviceToHost); + return 0; +} + +extern "C" +int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag, + int state_index, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex, + i0, j0, k0, sx, sy, sz, cudaMemcpyHostToDevice); + return 0; +} + +extern "C" +int bssn_cuda_has_resident_state(void *block_tag) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + return has_resident_state(block_tag) ? 1 : 0; +} + extern "C" void bssn_cuda_release_step_ctx(void *block_tag) { diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 4d3a41f..8d12c40 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -49,7 +49,44 @@ int bssn_cuda_rk4_substep(void *block_tag, int &Symmetry, int &Lev, double &eps, - int &co); + int &co, + int &keep_resident_state, + int &apply_enforce_ga, + double &chitiny); + +int bssn_cuda_copy_state_region_to_host(void *block_tag, + int state_index, + double *host_state, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int bssn_cuda_copy_state_region_from_host(void *block_tag, + int state_index, + double *host_state, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int bssn_cuda_download_resident_state(void *block_tag, + int *ex, + double **state_host_out); + +int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag, + int state_index, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag, + int state_index, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int bssn_cuda_has_resident_state(void *block_tag); void bssn_cuda_release_step_ctx(void *block_tag);