diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index b87ce6d..59db417 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -1,9 +1,115 @@ #include "Parallel.h" -#include "fmisc.h" -#include "prolongrestrict.h" -#include "misc.h" -#include "parameters.h" +#include "fmisc.h" +#include "prolongrestrict.h" +#include "misc.h" +#include "parameters.h" +#include + +namespace { + +const char *g_parallel_transfer_context = "Parallel::transfer"; +int parallel_next_transfer_tag() +{ + const char *ctx = g_parallel_transfer_context ? g_parallel_transfer_context : "Parallel::transfer"; + unsigned int hash = 5381u; + while (*ctx) + { + hash = ((hash << 5) + hash) + static_cast(*ctx); + ctx++; + } + return 32 + static_cast(hash % 32000u); +} + +struct ParallelTransferContextGuard +{ + const char *prev; + explicit ParallelTransferContextGuard(const char *context) : prev(g_parallel_transfer_context) + { + g_parallel_transfer_context = context; + } + ~ParallelTransferContextGuard() + { + g_parallel_transfer_context = prev; + } +}; + +void parallel_report_mpi_error(const char *context, int errcode, int req_no) +{ + char errstr[MPI_MAX_ERROR_STRING]; + int len = 0; + MPI_Error_string(errcode, errstr, &len); + int myrank = -1; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + fprintf(stderr, "MPI failure on rank %d in %s (transfer_ctx=%s, req_no=%d): %.*s\n", + myrank, context, g_parallel_transfer_context, req_no, len, errstr); + fflush(stderr); +} + +void parallel_report_mpi_status_errors(const char *context, int req_no, + int status_count, MPI_Status *stats) +{ + if (!stats) + return; + + int myrank = -1; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + const int count = (status_count >= 0 && status_count <= req_no) ? status_count : req_no; + for (int i = 0; i < count; i++) + { + if (stats[i].MPI_ERROR != MPI_SUCCESS) + { + char errstr[MPI_MAX_ERROR_STRING]; + int len = 0; + MPI_Error_string(stats[i].MPI_ERROR, errstr, &len); + fprintf(stderr, + "MPI request failure on rank %d in %s (transfer_ctx=%s, status_idx=%d, source=%d, tag=%d): %.*s\n", + myrank, context, g_parallel_transfer_context, i, + stats[i].MPI_SOURCE, stats[i].MPI_TAG, len, errstr); + } + } + fflush(stderr); +} + +int parallel_waitsome_checked(int req_no, MPI_Request *reqs, int *outcount, + int *completed, MPI_Status *stats, + const char *context) +{ + MPI_Errhandler old_handler; + MPI_Comm_get_errhandler(MPI_COMM_WORLD, &old_handler); + MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); + int rc = MPI_Waitsome(req_no, reqs, outcount, completed, stats); + MPI_Comm_set_errhandler(MPI_COMM_WORLD, old_handler); + MPI_Errhandler_free(&old_handler); + if (rc != MPI_SUCCESS) + { + parallel_report_mpi_status_errors(context, req_no, outcount ? *outcount : req_no, stats); + parallel_report_mpi_error(context, rc, req_no); + MPI_Abort(MPI_COMM_WORLD, 1); + } + return rc; +} + +int parallel_waitall_checked(int req_no, MPI_Request *reqs, MPI_Status *stats, + const char *context) +{ + MPI_Errhandler old_handler; + MPI_Comm_get_errhandler(MPI_COMM_WORLD, &old_handler); + MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); + int rc = MPI_Waitall(req_no, reqs, stats); + MPI_Comm_set_errhandler(MPI_COMM_WORLD, old_handler); + MPI_Errhandler_free(&old_handler); + if (rc != MPI_SUCCESS) + { + parallel_report_mpi_status_errors(context, req_no, req_no, stats); + parallel_report_mpi_error(context, rc, req_no); + MPI_Abort(MPI_COMM_WORLD, 1); + } + return rc; +} + +} // namespace int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion { @@ -3896,10 +4002,11 @@ void Parallel::transfer(MyList **src, MyList **src, MyList 0) - { + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + if (recv_lengths[node] > 0) + { rec_data[node] = new double[recv_lengths[node]]; if (!rec_data[node]) { cout << "out of memory when new in short transfer, place 1" << endl; MPI_Abort(MPI_COMM_WORLD, 1); } - MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, reqs + req_no); req_node[req_no] = node; req_is_recv[req_no] = 1; req_no++; pending_recv++; } - } - - // Local transfer on this rank. - recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - if (recv_lengths[myrank] > 0) - { + } + + // Local transfer on this rank. + recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + if (recv_lengths[myrank] > 0) + { rec_data[myrank] = new double[recv_lengths[myrank]]; if (!rec_data[myrank]) { @@ -3947,33 +4054,33 @@ void Parallel::transfer(MyList **src, MyList 0) - { - send_data[node] = new double[send_lengths[node]]; + // Pack and post sends. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (send_lengths[node] > 0) + { + send_data[node] = new double[send_lengths[node]]; if (!send_data[node]) { cout << "out of memory when new in short transfer, place 3" << endl; MPI_Abort(MPI_COMM_WORLD, 1); } data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, reqs + req_no); req_node[req_no] = node; req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) + req_no++; + } + } + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) { int outcount = 0; - MPI_Waitsome(req_no, reqs, &outcount, completed, stats); + parallel_waitsome_checked(req_no, reqs, &outcount, completed, stats, + "Parallel::transfer"); if (outcount == MPI_UNDEFINED) break; for (int i = 0; i < outcount; i++) @@ -3988,7 +4095,7 @@ void Parallel::transfer(MyList **src, MyList 0) MPI_Waitall(req_no, reqs, stats); + if (req_no > 0) parallel_waitall_checked(req_no, reqs, stats, "Parallel::transfer"); if (rec_data[myrank]) data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); @@ -4005,12 +4112,12 @@ void Parallel::transfer(MyList **src, MyList **src, MyList **dst, MyList *VarList1 /* source */, MyList *VarList2 /*target */, @@ -4027,8 +4134,9 @@ void Parallel::transfermix(MyList **src, MyList **src, MyList **src, MyList **src, MyList 0) { int outcount = 0; - MPI_Waitsome(req_no, reqs, &outcount, completed, stats); + parallel_waitsome_checked(req_no, reqs, &outcount, completed, stats, + "Parallel::transfermix"); if (outcount == MPI_UNDEFINED) break; for (int i = 0; i < outcount; i++) @@ -4117,7 +4226,7 @@ void Parallel::transfermix(MyList **src, MyList 0) MPI_Waitall(req_no, reqs, stats); + if (req_no > 0) parallel_waitall_checked(req_no, reqs, stats, "Parallel::transfermix"); if (rec_data[myrank]) data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); @@ -4134,16 +4243,21 @@ void Parallel::transfermix(MyList **src, MyList *VarList, int Symmetry) -{ - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + delete[] completed; + delete[] send_data; + delete[] rec_data; + delete[] send_lengths; + delete[] recv_lengths; +} +void Parallel::Sync(Patch *Pat, MyList *VarList, int Symmetry) +{ + Sync(Pat, VarList, Symmetry, "Parallel::Sync(Patch)"); +} + +void Parallel::Sync(Patch *Pat, MyList *VarList, int Symmetry, const char *context) +{ + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); MyList *dst; MyList **src, **transfer_src, **transfer_dst; @@ -4159,7 +4273,10 @@ void Parallel::Sync(Patch *Pat, MyList *VarList, int Symmetry) // but for transfer_dst[node] the data may locate on any node } - transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + { + ParallelTransferContextGuard transfer_ctx(context ? context : "Parallel::Sync(Patch)"); + transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + } if (dst) dst->destroyList(); @@ -4177,15 +4294,26 @@ void Parallel::Sync(Patch *Pat, MyList *VarList, int Symmetry) delete[] transfer_src; delete[] transfer_dst; } -void Parallel::Sync(MyList *PatL, MyList *VarList, int Symmetry) -{ - // Patch inner Synch - MyList *Pp = PatL; - while (Pp) - { - Sync(Pp->data, VarList, Symmetry); - Pp = Pp->next; - } +void Parallel::Sync(MyList *PatL, MyList *VarList, int Symmetry) +{ + Sync(PatL, VarList, Symmetry, "Parallel::Sync(PatchList)"); +} + +void Parallel::Sync(MyList *PatL, MyList *VarList, int Symmetry, const char *context) +{ + std::string patchlist_context = context ? std::string(context) + " [patchlist]" : "Parallel::Sync(PatchList)"; + + // Patch inner Synch + MyList *Pp = PatL; + int patch_index = 0; + while (Pp) + { + std::string patch_context = context ? std::string(context) + " [patch " + std::to_string(patch_index) + "]" + : "Parallel::Sync(Patch)"; + Sync(Pp->data, VarList, Symmetry, patch_context.c_str()); + Pp = Pp->next; + patch_index++; + } // Patch inter Synch int cpusize; @@ -4204,7 +4332,10 @@ void Parallel::Sync(MyList *PatL, MyList *VarList, int Symmetry) build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node } - transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + { + ParallelTransferContextGuard transfer_ctx(patchlist_context.c_str()); + transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + } if (dst) dst->destroyList(); @@ -4224,10 +4355,15 @@ void Parallel::Sync(MyList *PatL, MyList *VarList, int Symmetry) } // Merged Sync: collect all intra-patch and inter-patch grid segment lists, // then issue a single transfer() call instead of N+1 separate ones. -void Parallel::Sync_merged(MyList *PatL, MyList *VarList, int Symmetry) -{ - int cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); +void Parallel::Sync_merged(MyList *PatL, MyList *VarList, int Symmetry) +{ + Sync_merged(PatL, VarList, Symmetry, "Parallel::Sync_merged"); +} + +void Parallel::Sync_merged(MyList *PatL, MyList *VarList, int Symmetry, const char *context) +{ + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); MyList **combined_src = new MyList *[cpusize]; MyList **combined_dst = new MyList *[cpusize]; @@ -4301,8 +4437,11 @@ void Parallel::Sync_merged(MyList *PatL, MyList *VarList, int Symmet if (dst_buffer) dst_buffer->destroyList(); - // Phase C: Single transfer - transfer(combined_src, combined_dst, VarList, VarList, Symmetry); + // Phase C: Single transfer + { + ParallelTransferContextGuard transfer_ctx(context ? context : "Parallel::Sync_merged"); + transfer(combined_src, combined_dst, VarList, VarList, Symmetry); + } // Phase D: Cleanup for (int node = 0; node < cpusize; node++) @@ -4380,8 +4519,9 @@ void Parallel::transfer_cached(MyList **src, MyList **src, MyList **src, MyList **src, MyList 0) { int outcount = 0; - MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); + parallel_waitsome_checked(req_no, cache.reqs, &outcount, completed, cache.stats, + "Parallel::transfer_cached"); if (outcount == MPI_UNDEFINED) break; for (int i = 0; i < outcount; i++) @@ -4466,11 +4607,11 @@ void Parallel::transfer_cached(MyList **src, MyList 0) MPI_Waitall(req_no, cache.reqs, cache.stats); + if (req_no > 0) parallel_waitall_checked(req_no, cache.reqs, cache.stats, "Parallel::transfer_cached"); - if (self_len > 0) - data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); -} + if (self_len > 0) + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); +} void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache) { if (!cache.valid) @@ -4669,12 +4810,13 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr } // Now pack and post async MPI operations - int myrank; - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - int cpusize = cache.cpusize; - state.req_no = 0; - state.active = true; - state.pending_recv = 0; + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + int cpusize = cache.cpusize; + state.req_no = 0; + state.active = true; + state.mpi_tag = parallel_next_transfer_tag(); + state.pending_recv = 0; // Allocate tracking arrays delete[] state.req_node; delete[] state.req_is_recv; state.req_node = new int[cache.max_reqs]; @@ -4725,7 +4867,7 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); state.req_node[state.req_no] = node; state.req_is_recv[state.req_no] = 0; - MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, state.mpi_tag, MPI_COMM_WORLD, cache.reqs + state.req_no++); } int rlength; if (!cache.lengths_valid) { @@ -4745,7 +4887,7 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr state.req_node[state.req_no] = node; state.req_is_recv[state.req_no] = 1; state.pending_recv++; - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, state.mpi_tag, MPI_COMM_WORLD, cache.reqs + state.req_no++); } } } @@ -4775,7 +4917,8 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, while (pending > 0) { int outcount = 0; - MPI_Waitsome(state.req_no, cache.reqs, &outcount, completed, cache.stats); + parallel_waitsome_checked(state.req_no, cache.reqs, &outcount, completed, cache.stats, + "Parallel::Sync_finish"); if (outcount == MPI_UNDEFINED) break; for (int i = 0; i < outcount; i++) { @@ -4791,8 +4934,8 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, delete[] completed; } - // Wait for remaining sends - if (state.req_no > 0) MPI_Waitall(state.req_no, cache.reqs, cache.stats); + // Wait for remaining sends + if (state.req_no > 0) parallel_waitall_checked(state.req_no, cache.reqs, cache.stats, "Parallel::Sync_finish"); delete[] state.req_node; state.req_node = 0; delete[] state.req_is_recv; state.req_is_recv = 0; @@ -5235,7 +5378,10 @@ void Parallel::PeriodicBD(Patch *Pat, MyList *VarList, int Symmetry) build_PhysBD_gstl(Pat, src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node } - transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + { + ParallelTransferContextGuard transfer_ctx("Parallel::PeriodicBD"); + transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + } if (dst) dst->destroyList(); @@ -5506,7 +5652,10 @@ void Parallel::Prolong(Patch *Patc, Patch *Patf, build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node } - transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + { + ParallelTransferContextGuard transfer_ctx("Parallel::Prolong"); + transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + } if (dst) dst->destroyList(); @@ -5565,7 +5714,10 @@ void Parallel::Restrict(MyList *PatcL, MyList *PatfL, build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node } - transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + { + ParallelTransferContextGuard transfer_ctx("Parallel::Restrict"); + transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + } if (dst) dst->destroyList(); @@ -5609,7 +5761,10 @@ void Parallel::Restrict_after(MyList *PatcL, MyList *PatfL, build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node } - transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + { + ParallelTransferContextGuard transfer_ctx("Parallel::Restrict_after"); + transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + } if (dst) dst->destroyList(); @@ -5654,7 +5809,10 @@ void Parallel::OutBdLow2Hi(Patch *Patc, Patch *Patf, build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node } - transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + { + ParallelTransferContextGuard transfer_ctx("Parallel::OutBdLow2Hi(Patch)"); + transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + } if (dst) dst->destroyList(); @@ -5710,7 +5868,10 @@ void Parallel::OutBdLow2Hi(MyList *PatcL, MyList *PatfL, build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node } - transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + { + ParallelTransferContextGuard transfer_ctx("Parallel::OutBdLow2Hi(PatchList)"); + transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry); + } if (dst) dst->destroyList(); @@ -5981,13 +6142,14 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, int myrank; MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - int cpusize = cache.cpusize; - - int req_no = 0; - int pending_recv = 0; - int *req_node = new int[cache.max_reqs]; - int *req_is_recv = new int[cache.max_reqs]; - int *completed = new int[cache.max_reqs]; + int cpusize = cache.cpusize; + + int req_no = 0; + int pending_recv = 0; + const int mpi_tag = parallel_next_transfer_tag(); + int *req_node = new int[cache.max_reqs]; + int *req_is_recv = new int[cache.max_reqs]; + int *completed = new int[cache.max_reqs]; // Post receives first so peers can progress rendezvous early. for (int node = 0; node < cpusize; node++) @@ -6004,7 +6166,7 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, cache.recv_bufs[node] = new double[rlength]; cache.recv_buf_caps[node] = rlength; } - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, cache.reqs + req_no); req_node[req_no] = node; req_is_recv[req_no] = 1; req_no++; @@ -6042,7 +6204,7 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, cache.send_buf_caps[node] = slength; } data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, cache.reqs + req_no); req_node[req_no] = node; req_is_recv[req_no] = 0; req_no++; @@ -6053,7 +6215,8 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, while (pending_recv > 0) { int outcount = 0; - MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); + parallel_waitsome_checked(req_no, cache.reqs, &outcount, completed, cache.stats, + "Parallel::transfermix_cached"); if (outcount == MPI_UNDEFINED) break; for (int i = 0; i < outcount; i++) @@ -6068,10 +6231,10 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, } } - if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); + if (req_no > 0) parallel_waitall_checked(req_no, cache.reqs, cache.stats, "Parallel::transfermix_cached"); - if (self_len > 0) - data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + if (self_len > 0) + data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); delete[] req_node; delete[] req_is_recv; @@ -6787,7 +6950,10 @@ void Parallel::fill_level_data(MyList *PatLd, MyList *PatLs, MyLis build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node } - transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + { + ParallelTransferContextGuard transfer_ctx("Parallel::Interp copy"); + transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + } for (int node = 0; node < cpusize; node++) { @@ -6826,7 +6992,10 @@ void Parallel::fill_level_data(MyList *PatLd, MyList *PatLs, MyLis Sync(PatcL, FutureList, Symmetry); } //<~~~prolong then - transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry); + { + ParallelTransferContextGuard transfer_ctx("Parallel::Interp prolong FutureList"); + transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry); + } // for StateList // time interpolation part @@ -6842,7 +7011,10 @@ void Parallel::fill_level_data(MyList *PatLd, MyList *PatLs, MyLis Sync(PatcL, tmList, Symmetry); } //<~~~prolong then - transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry); + { + ParallelTransferContextGuard transfer_ctx("Parallel::Interp prolong StateList"); + transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry); + } } else { @@ -6853,7 +7025,10 @@ void Parallel::fill_level_data(MyList *PatLd, MyList *PatLs, MyLis Sync(PatcL, VarList, Symmetry); } //<~~~prolong then - transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + { + ParallelTransferContextGuard transfer_ctx("Parallel::Interp prolong VarList"); + transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry); + } } for (int node = 0; node < cpusize; node++) diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h index 0ab975c..f115af5 100644 --- a/AMSS_NCKU_source/Parallel.h +++ b/AMSS_NCKU_source/Parallel.h @@ -89,9 +89,12 @@ namespace Parallel void transfermix(MyList **src, MyList **dst, MyList *VarList1 /* source */, MyList *VarList2 /*target */, int Symmetry); - void Sync(Patch *Pat, MyList *VarList, int Symmetry); - void Sync(MyList *PatL, MyList *VarList, int Symmetry); - void Sync_merged(MyList *PatL, MyList *VarList, int Symmetry); + void Sync(Patch *Pat, MyList *VarList, int Symmetry); + void Sync(Patch *Pat, MyList *VarList, int Symmetry, const char *context); + void Sync(MyList *PatL, MyList *VarList, int Symmetry); + void Sync(MyList *PatL, MyList *VarList, int Symmetry, const char *context); + void Sync_merged(MyList *PatL, MyList *VarList, int Symmetry); + void Sync_merged(MyList *PatL, MyList *VarList, int Symmetry, const char *context); struct SyncCache { bool valid; @@ -121,14 +124,15 @@ namespace Parallel MyList *VarList1, MyList *VarList2, int Symmetry, SyncCache &cache); - struct AsyncSyncState { - int req_no; - bool active; - int *req_node; - int *req_is_recv; - int pending_recv; - AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {} - }; + struct AsyncSyncState { + int req_no; + bool active; + int mpi_tag; + int *req_node; + int *req_is_recv; + int pending_recv; + AsyncSyncState() : req_no(0), active(false), mpi_tag(0), req_node(0), req_is_recv(0), pending_recv(0) {} + }; void Sync_start(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache, AsyncSyncState &state); diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 179233e..9e085be 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -14,7 +14,8 @@ using namespace std; #include #endif -#include +#include +#include #include "macrodef.h" #include "misc.h" @@ -2036,9 +2037,10 @@ void bssn_class::Read_Ansorg() void bssn_class::Evolve(int Steps) { - clock_t prev_clock, curr_clock; - double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0; - LastAnas = 0; + clock_t prev_clock, curr_clock; + double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0; + LastAnas = 0; + LastConsOut = 0; #if 0 //initial checkpoint for special uasge { @@ -2296,18 +2298,21 @@ void bssn_class::Evolve(int Steps) //////////////////////////////////////////////////////////// // When LastCheck >= CheckTime, perform runtime checks and output status data - if (LastCheck >= CheckTime) - { - LastCheck = 0; - - CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass); + if (LastCheck >= CheckTime) + { + LastCheck = 0; + + CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass); CheckPoint->writecheck_cgh(PhysTime, GH); #ifdef WithShell CheckPoint->writecheck_sh(PhysTime, SH); -#endif - CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas); - } - } +#endif + CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas); + } + + // Keep output/analysis phases aligned across ranks before the next coarse step. + MPI_Barrier(MPI_COMM_WORLD); + } /* #ifdef With_AHF // final apparent horizon finding @@ -6253,7 +6258,7 @@ for(int ilev = GH->levels-1;ilev>=lev;ilev--) for(int ilev=GH->levels-1;ilev>lev;ilev--) RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List); #else - Parallel::Sync(GH->PatL[lev], DG_List, Symmetry); + Parallel::Sync(GH->PatL[lev], DG_List, Symmetry, "bssn_class::Compute_Psi4"); #endif #ifdef WithShell @@ -6908,10 +6913,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev) { LastAnas += dT_lev; - if (LastAnas >= AnasTime) - { -#ifdef Point_Psi4 -#error "not support parallel levels yet" + if (LastAnas >= AnasTime) + { +#ifdef Point_Psi4 +#error "not support parallel levels yet" // Gam_ijk and R_ij have been calculated in Interp_Constraint() double SYM = 1, ANT = -1; for (int levh = lev; levh < GH->levels; levh++) @@ -7255,9 +7260,9 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev) //================================================================================================ -void bssn_class::Constraint_Out() -{ - LastConsOut += dT * pow(0.5, Mymax(0, trfls)); +void bssn_class::Constraint_Out() +{ + LastConsOut += dT * pow(0.5, Mymax(0, trfls)); if (LastConsOut >= AnasTime) // Constraint violation @@ -7322,7 +7327,7 @@ void bssn_class::Constraint_Out() Pp = Pp->next; } } - Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry); + Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Constraint_Out[level]"); } #ifdef WithShell if (0) // if the constrait quantities can be reused from the step rhs calculation @@ -7544,7 +7549,7 @@ void bssn_class::AH_Prepare_derivatives() } Pp = Pp->next; } - Parallel::Sync(GH->PatL[lev], AHDList, Symmetry); + Parallel::Sync(GH->PatL[lev], AHDList, Symmetry, "bssn_class::AH_Prepare_derivatives"); } } @@ -7825,7 +7830,7 @@ void bssn_class::Interp_Constraint(bool infg) Pp = Pp->next; } } - Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry); + Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Interp_Constraint[level]"); } #ifdef WithShell if (0) // if the constrait quantities can be reused from the step rhs calculation @@ -8083,7 +8088,7 @@ void bssn_class::Compute_Constraint() Pp = Pp->next; } } - Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry); + Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Compute_Constraint[level]"); } // prolong restrict constraint quantities for (lev = GH->levels - 1; lev > 0; lev--) @@ -8396,12 +8401,18 @@ void bssn_class::Enforce_algcon(int lev, int fg) //================================================================================================ -bool bssn_class::check_Stdin_Abort() -{ - - fd_set readfds; - - struct timeval timeout; +bool bssn_class::check_Stdin_Abort() +{ + // Non-interactive launches (mpirun via Python/subprocess, batch jobs, redirected stdin) + // should not probe stdin. Some MPI runtimes treat stdin as a managed channel and can + // fail when rank 0 polls/consumes it. + if (!isatty(STDIN_FILENO)) { + return false; + } + + fd_set readfds; + + struct timeval timeout; FD_ZERO(&readfds); FD_SET(STDIN_FILENO, &readfds); @@ -8410,14 +8421,17 @@ bool bssn_class::check_Stdin_Abort() timeout.tv_sec = 0; timeout.tv_usec = 0; - int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout); - - if (activity > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { - string input_abort; - if (cin >> input_abort) { - if (input_abort == "stop") { - return true; - } + int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout); + if (activity <= 0) { + return false; + } + + if (FD_ISSET(STDIN_FILENO, &readfds)) { + string input_abort; + if (cin >> input_abort) { + if (input_abort == "stop") { + return true; + } } } diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index b4f95ae..3654959 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -1,4 +1,5 @@ #include "bssn_cuda_ops.h" +#include "bssn_gpu.h" #include #include @@ -79,6 +80,29 @@ inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes) return true; } +inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *src, size_t bytes) +{ + if (!ensure_capacity(dst, bytes)) + return false; + + const double *device_src = bssn_gpu_find_device_buffer(src); + cudaMemcpyKind kind = cudaMemcpyHostToDevice; + const void *copy_src = src; + if (device_src) + { + copy_src = device_src; + kind = cudaMemcpyDeviceToDevice; + } + + cudaError_t err = cudaMemcpy(dst.ptr, copy_src, bytes, kind); + if (err != cudaSuccess) + { + report_cuda_error(kind == cudaMemcpyDeviceToDevice ? "cudaMemcpy(D2D)" : "cudaMemcpy(H2D)", err); + return false; + } + return true; +} + __global__ void enforce_ga_kernel(int n, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, @@ -374,7 +398,7 @@ __global__ void sommerfeld_bam_kernel(int nx, int ny, int nz, } } -inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **args) +inline bool launch_kernel(dim3 grid, dim3 block, const void *kernel, void **args) { cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr); if (err != cudaSuccess) @@ -382,10 +406,10 @@ inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **ar report_cuda_error("cudaLaunchKernel", err); return false; } - err = cudaDeviceSynchronize(); + err = cudaPeekAtLastError(); if (err != cudaSuccess) { - report_cuda_error("cudaDeviceSynchronize", err); + report_cuda_error("cudaPeekAtLastError", err); return false; } return true; @@ -411,18 +435,18 @@ int bssn_cuda_enforce_ga(int *ex, dim3 block(256); dim3 grid(div_up(n, static_cast(block.x))); - bool ok = copy_to_device(cache.dxx, dxx, bytes) && - copy_to_device(cache.gxy, gxy, bytes) && - copy_to_device(cache.gxz, gxz, bytes) && - copy_to_device(cache.dyy, dyy, bytes) && - copy_to_device(cache.gyz, gyz, bytes) && - copy_to_device(cache.dzz, dzz, bytes) && - copy_to_device(cache.Axx, Axx, bytes) && - copy_to_device(cache.Axy, Axy, bytes) && - copy_to_device(cache.Axz, Axz, bytes) && - copy_to_device(cache.Ayy, Ayy, bytes) && - copy_to_device(cache.Ayz, Ayz, bytes) && - copy_to_device(cache.Azz, Azz, bytes); + bool ok = copy_to_device_preferring_device(cache.dxx, dxx, bytes) && + copy_to_device_preferring_device(cache.gxy, gxy, bytes) && + copy_to_device_preferring_device(cache.gxz, gxz, bytes) && + copy_to_device_preferring_device(cache.dyy, dyy, bytes) && + copy_to_device_preferring_device(cache.gyz, gyz, bytes) && + copy_to_device_preferring_device(cache.dzz, dzz, bytes) && + copy_to_device_preferring_device(cache.Axx, Axx, bytes) && + copy_to_device_preferring_device(cache.Axy, Axy, bytes) && + copy_to_device_preferring_device(cache.Axz, Axz, bytes) && + copy_to_device_preferring_device(cache.Ayy, Ayy, bytes) && + copy_to_device_preferring_device(cache.Ayz, Ayz, bytes) && + copy_to_device_preferring_device(cache.Azz, Azz, bytes); if (ok) { @@ -432,7 +456,7 @@ int bssn_cuda_enforce_ga(int *ex, double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr; void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz, &d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz}; - ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args); + ok = launch_kernel(grid, block, (const void *)enforce_ga_kernel, args); } if (ok) @@ -527,10 +551,10 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, (rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum; ok = ok && - (!refresh_state0 || copy_to_device(cache.state0, state0, bytes)) && + (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) && (!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) && - (!need_stage_input || copy_to_device(cache.stage, stage_data, bytes)) && - (!refresh_rhs || copy_to_device(cache.rhs, rhs_accum, bytes)); + (!need_stage_input || copy_to_device_preferring_device(cache.stage, stage_data, bytes)) && + (!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes)); if (ok && !need_stage_input) ok = ensure_capacity(cache.stage, bytes); @@ -539,11 +563,15 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, return 1; if (refresh_state0) + { cache.host_state0 = state0; + bssn_gpu_register_device_buffer(state0, cache.state0.ptr); + } if (refresh_rhs) { cache.host_rhs = rhs_accum; cache.rhs_resident = true; + bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr); } double dX = X[1] - X[0]; @@ -582,14 +610,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, &bam_source, &bam_target, &soa0, &soa1, &soa2, &symmetry}; - ok = launch_and_sync(grid, block, (const void *)sommerfeld_bam_kernel, args); + ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args); } if (ok) { double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr; void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage}; - ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args); + ok = launch_kernel(grid, block, (const void *)rk4_kernel, args); } if (ok && lev > 0) @@ -599,11 +627,13 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, &has_xmin, &has_ymin, &has_zmin, &has_xmax, &has_ymax, &has_zmax, &d_state0, &d_stage}; - ok = launch_and_sync(grid, block, (const void *)copy_physical_boundary_kernel, args); + ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args); } if (ok) { + bssn_gpu_register_device_buffer(stage_data, cache.stage.ptr); + cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err); ok = err == cudaSuccess; @@ -621,13 +651,13 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny) dim3 block(256); dim3 grid(div_up(n, static_cast(block.x))); - bool ok = copy_to_device(d_chi, chi, bytes); + bool ok = copy_to_device_preferring_device(d_chi, chi, bytes); if (ok) { double *ptr = d_chi.ptr; void *args[] = {&n, &ptr, &tinny}; - ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args); + ok = launch_kernel(grid, block, (const void *)lowerbound_kernel, args); } if (ok) diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C index e713a83..95b1879 100644 --- a/AMSS_NCKU_source/bssn_cuda_step.C +++ b/AMSS_NCKU_source/bssn_cuda_step.C @@ -17,6 +17,13 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) #error "Step_MainPath_GPU currently supports Patch grids only." #endif + if (bssn_gpu_bind_process_device(myrank)) + { + cerr << "GPU device bind failure on MPI rank " << myrank << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + bssn_gpu_clear_cached_device_buffers(); + setpbh(BH_num, Porg0, Mass, BH_num_input); const double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); @@ -150,6 +157,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Parallel::AsyncSyncState async_pre; Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); + bssn_gpu_clear_cached_device_buffers(); MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE); if (ERROR) @@ -265,6 +273,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Parallel::AsyncSyncState async_cor; Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); + bssn_gpu_clear_cached_device_buffers(); MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE); if (ERROR) @@ -334,6 +343,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) RestrictProlong(lev, YN, BB); #endif + bssn_gpu_clear_cached_device_buffers(); + Pp = GH->PatL[lev]; while (Pp) { diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index 1db2bfe..72aecdf 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -135,6 +135,18 @@ struct GpuRhsCache const double *last_y = nullptr; const double *last_z = nullptr; bool meta_uploaded = false; + static const int max_mapped_buffers = 128; + const double *host_buffers[max_mapped_buffers] = {nullptr}; + const double *device_buffers[max_mapped_buffers] = {nullptr}; + int mapped_buffer_count = 0; +}; + +struct ExternalBufferRegistry +{ + static const int max_mapped_buffers = 256; + const double *host_buffers[max_mapped_buffers] = {nullptr}; + const double *device_buffers[max_mapped_buffers] = {nullptr}; + int mapped_buffer_count = 0; }; GpuRhsCache &gpu_rhs_cache() @@ -143,11 +155,81 @@ GpuRhsCache &gpu_rhs_cache() return cache; } +ExternalBufferRegistry &external_buffer_registry() +{ + static thread_local ExternalBufferRegistry registry; + return registry; +} + void reset_meta(Meta *meta) { memset(meta, 0, sizeof(Meta)); } +void reset_buffer_map(GpuRhsCache &cache) +{ + cache.mapped_buffer_count = 0; + for (int i = 0; i < GpuRhsCache::max_mapped_buffers; ++i) + { + cache.host_buffers[i] = nullptr; + cache.device_buffers[i] = nullptr; + } +} + +void map_buffer(GpuRhsCache &cache, const double *host_ptr, const double *device_ptr) +{ + if (!host_ptr || !device_ptr) + return; + + for (int i = 0; i < cache.mapped_buffer_count; ++i) + { + if (cache.host_buffers[i] == host_ptr) + { + cache.device_buffers[i] = device_ptr; + return; + } + } + + if (cache.mapped_buffer_count >= GpuRhsCache::max_mapped_buffers) + return; + + cache.host_buffers[cache.mapped_buffer_count] = host_ptr; + cache.device_buffers[cache.mapped_buffer_count] = device_ptr; + cache.mapped_buffer_count++; +} + +void reset_external_buffer_map(ExternalBufferRegistry ®istry) +{ + registry.mapped_buffer_count = 0; + for (int i = 0; i < ExternalBufferRegistry::max_mapped_buffers; ++i) + { + registry.host_buffers[i] = nullptr; + registry.device_buffers[i] = nullptr; + } +} + +void map_external_buffer(ExternalBufferRegistry ®istry, const double *host_ptr, const double *device_ptr) +{ + if (!host_ptr || !device_ptr) + return; + + for (int i = 0; i < registry.mapped_buffer_count; ++i) + { + if (registry.host_buffers[i] == host_ptr) + { + registry.device_buffers[i] = device_ptr; + return; + } + } + + if (registry.mapped_buffer_count >= ExternalBufferRegistry::max_mapped_buffers) + return; + + registry.host_buffers[registry.mapped_buffer_count] = host_ptr; + registry.device_buffers[registry.mapped_buffer_count] = device_ptr; + registry.mapped_buffer_count++; +} + bool ensure_device_buffer(double **ptr, size_t count) { if (*ptr) @@ -188,6 +270,45 @@ bool copy_buffers_to_device(const CopySpec *specs, size_t count) return true; } +const double *find_external_device_buffer(const ExternalBufferRegistry ®istry, const double *host_ptr) +{ + if (!host_ptr) + return nullptr; + + for (int i = 0; i < registry.mapped_buffer_count; ++i) + { + if (registry.host_buffers[i] == host_ptr) + return registry.device_buffers[i]; + } + return nullptr; +} + +bool copy_buffers_to_device_preferring_device(const CopySpec *specs, size_t count) +{ + for (size_t i = 0; i < count; ++i) + { + const double *device_src = bssn_gpu_find_device_buffer(specs[i].src); + cudaMemcpyKind kind = cudaMemcpyHostToDevice; + const void *copy_src = specs[i].src; + if (device_src) + { + copy_src = device_src; + kind = cudaMemcpyDeviceToDevice; + } + + cudaError_t err = cudaMemcpy(specs[i].dst, copy_src, + specs[i].count * sizeof(double), + kind); + if (err != cudaSuccess) + { + cerr << "cudaMemcpy(" << (kind == cudaMemcpyDeviceToDevice ? "D2D" : "H2D") + << ") failed: " << cudaGetErrorString(err) << endl; + return false; + } + } + return true; +} + bool zero_buffers(const ZeroSpec *specs, size_t count) { for (size_t i = 0; i < count; ++i) @@ -219,6 +340,8 @@ void cleanup_gpu_rhs_cache() cache.last_x = nullptr; cache.last_y = nullptr; cache.last_z = nullptr; + reset_buffer_map(cache); + reset_external_buffer_map(external_buffer_registry()); } bool register_gpu_rhs_cleanup() @@ -285,6 +408,7 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex) cache.last_y = nullptr; cache.last_z = nullptr; cache.meta_uploaded = false; + reset_buffer_map(cache); Meta *meta = &cache.meta; const int matrix_size = cache.matrix_size; @@ -467,6 +591,7 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex) cache.last_x = nullptr; cache.last_y = nullptr; cache.last_z = nullptr; + reset_buffer_map(cache); return false; } @@ -491,8 +616,56 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex) return true; } +const double *find_mapped_device_buffer(const GpuRhsCache &cache, const double *host_ptr) +{ + if (!host_ptr) + return nullptr; + + for (int i = 0; i < cache.mapped_buffer_count; ++i) + { + if (cache.host_buffers[i] == host_ptr) + return cache.device_buffers[i]; + } + return nullptr; +} + } // namespace +const double *bssn_gpu_find_device_buffer(const double *host_ptr) +{ + const double *device_ptr = find_external_device_buffer(external_buffer_registry(), host_ptr); + if (device_ptr) + return device_ptr; + return find_mapped_device_buffer(gpu_rhs_cache(), host_ptr); +} + +int bssn_gpu_bind_process_device(int mpi_rank) +{ + const int device = select_cuda_device_for_process(mpi_rank); + if (device < 0) + return 1; + + cudaError_t err = cudaSetDevice(device); + if (err != cudaSuccess) + { + cerr << "cudaSetDevice(" << device << ") failed: " + << cudaGetErrorString(err) << endl; + return 1; + } + return 0; +} + +void bssn_gpu_clear_cached_device_buffers() +{ + reset_external_buffer_map(external_buffer_registry()); + reset_buffer_map(gpu_rhs_cache()); +} + +void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr) +{ + map_external_buffer(external_buffer_registry(), host_ptr, device_ptr); +} + __global__ void test_const_address(double * testd){ int _t = blockIdx.x*blockDim.x+threadIdx.x; if(_t == 0) @@ -739,16 +912,12 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub } #endif //ifdef Cell #endif //ifdef Vertex -inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){ - sub_symmetry_bd_partF<<>>(ord,func,funcc); - cudaThreadSynchronize(); - sub_symmetry_bd_partI<<>>(ord,func,funcc,SoA[0]); - cudaThreadSynchronize(); - sub_symmetry_bd_partJ<<>>(ord,func,funcc,SoA[1]); - cudaThreadSynchronize(); - sub_symmetry_bd_partK<<>>(ord,func,funcc,SoA[2]); - cudaThreadSynchronize(); -} +inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){ + sub_symmetry_bd_partF<<>>(ord,func,funcc); + sub_symmetry_bd_partI<<>>(ord,func,funcc,SoA[0]); + sub_symmetry_bd_partJ<<>>(ord,func,funcc,SoA[1]); + sub_symmetry_bd_partK<<>>(ord,func,funcc,SoA[2]); +} __global__ void sub_fdderivs_part1(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz) @@ -837,19 +1006,17 @@ __global__ void sub_fdderivs_part1(double * f,double *fh,double *fxx,double *fxy __syncthreads(); } -inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz,double* SoA) -{ - sub_symmetry_bd(2,f,fh,SoA); - cudaMemset(fxx,0,_3D_SIZE[0] * sizeof(double)); - cudaMemset(fxy,0,_3D_SIZE[0] * sizeof(double)); - cudaMemset(fxz,0,_3D_SIZE[0] * sizeof(double)); - cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double)); - cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double)); - cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double)); - cudaThreadSynchronize(); - sub_fdderivs_part1<<>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz); - cudaThreadSynchronize(); -} +inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz,double* SoA) +{ + sub_symmetry_bd(2,f,fh,SoA); + cudaMemset(fxx,0,_3D_SIZE[0] * sizeof(double)); + cudaMemset(fxy,0,_3D_SIZE[0] * sizeof(double)); + cudaMemset(fxz,0,_3D_SIZE[0] * sizeof(double)); + cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double)); + cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double)); + cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double)); + sub_fdderivs_part1<<>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz); +} __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz ) { @@ -905,18 +1072,16 @@ __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,d } } -inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,double * SoA) -{ - sub_symmetry_bd(2,f,fh,SoA); - - cudaMemset(fx,0,_3D_SIZE[0] * sizeof(double)); - cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double)); - cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double)); - - cudaThreadSynchronize(); - sub_fderivs_part1<<>>(f,fh,fx,fy,fz); - cudaThreadSynchronize(); -} +inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,double * SoA) +{ + sub_symmetry_bd(2,f,fh,SoA); + + cudaMemset(fx,0,_3D_SIZE[0] * sizeof(double)); + cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double)); + cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double)); + + sub_fderivs_part1<<>>(f,fh,fx,fy,fz); +} __global__ void computeRicci_part1(double * dst) { @@ -930,14 +1095,12 @@ __global__ void computeRicci_part1(double * dst) } } - inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta) -{ - sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA); - cudaThreadSynchronize(); - computeRicci_part1<<>>(dst); - cudaThreadSynchronize(); - -}/*Exception*/ + inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta) +{ + sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA); + computeRicci_part1<<>>(dst); + +}/*Exception*/ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs) { @@ -989,13 +1152,11 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs) } } -inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA) -{ - sub_symmetry_bd(3,f,fh,SoA); - cudaThreadSynchronize(); - sub_kodis_part1<<>>(f,fh,f_rhs); - cudaThreadSynchronize(); -} +inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA) +{ + sub_symmetry_bd(3,f,fh,SoA); + sub_kodis_part1<<>>(f,fh,f_rhs); +} __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz) { @@ -1083,12 +1244,10 @@ __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S } -inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){ - sub_symmetry_bd(3,f,fh,SoA); - cudaThreadSynchronize(); - sub_lopsided_part1<<>>(f,fh,f_rhs,Sfx,Sfy,Sfz); - cudaThreadSynchronize(); -} +inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){ + sub_symmetry_bd(3,f,fh,SoA); + sub_lopsided_part1<<>>(f,fh,f_rhs,Sfx,Sfy,Sfz); +} __global__ void compute_rhs_bssn_part1() { @@ -2866,6 +3025,8 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, cache.last_z = Z; } + reset_buffer_map(cache); + const CopySpec state_copies[] = { {Mh_ chi, chi, static_cast(matrix_size)}, {Mh_ dxx, dxx, static_cast(matrix_size)}, @@ -2892,7 +3053,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, {Mh_ dtSfy, dtSfy, static_cast(matrix_size)}, {Mh_ dtSfz, dtSfz, static_cast(matrix_size)}, }; - if (!copy_buffers_to_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0]))) + if (!copy_buffers_to_device_preferring_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0]))) return 1; const ZeroSpec zero_specs[] = { @@ -2977,9 +3138,57 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, }; if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0]))) return 1; - - - double sss[3] = {1,1,1}; + map_buffer(cache, chi, Mh_ chi); + map_buffer(cache, trK, Mh_ trK); + map_buffer(cache, dxx, Mh_ dxx); + map_buffer(cache, gxy, Mh_ gxy); + map_buffer(cache, gxz, Mh_ gxz); + map_buffer(cache, dyy, Mh_ dyy); + map_buffer(cache, gyz, Mh_ gyz); + map_buffer(cache, dzz, Mh_ dzz); + map_buffer(cache, Axx, Mh_ Axx); + map_buffer(cache, Axy, Mh_ Axy); + map_buffer(cache, Axz, Mh_ Axz); + map_buffer(cache, Ayy, Mh_ Ayy); + map_buffer(cache, Ayz, Mh_ Ayz); + map_buffer(cache, Azz, Mh_ Azz); + map_buffer(cache, Gamx, Mh_ Gamx); + map_buffer(cache, Gamy, Mh_ Gamy); + map_buffer(cache, Gamz, Mh_ Gamz); + map_buffer(cache, Lap, Mh_ Lap); + map_buffer(cache, betax, Mh_ betax); + map_buffer(cache, betay, Mh_ betay); + map_buffer(cache, betaz, Mh_ betaz); + map_buffer(cache, dtSfx, Mh_ dtSfx); + map_buffer(cache, dtSfy, Mh_ dtSfy); + map_buffer(cache, dtSfz, Mh_ dtSfz); + map_buffer(cache, chi_rhs, Mh_ chi_rhs); + map_buffer(cache, trK_rhs, Mh_ trK_rhs); + map_buffer(cache, gxx_rhs, Mh_ gxx_rhs); + map_buffer(cache, gxy_rhs, Mh_ gxy_rhs); + map_buffer(cache, gxz_rhs, Mh_ gxz_rhs); + map_buffer(cache, gyy_rhs, Mh_ gyy_rhs); + map_buffer(cache, gyz_rhs, Mh_ gyz_rhs); + map_buffer(cache, gzz_rhs, Mh_ gzz_rhs); + map_buffer(cache, Axx_rhs, Mh_ Axx_rhs); + map_buffer(cache, Axy_rhs, Mh_ Axy_rhs); + map_buffer(cache, Axz_rhs, Mh_ Axz_rhs); + map_buffer(cache, Ayy_rhs, Mh_ Ayy_rhs); + map_buffer(cache, Ayz_rhs, Mh_ Ayz_rhs); + map_buffer(cache, Azz_rhs, Mh_ Azz_rhs); + map_buffer(cache, Gamx_rhs, Mh_ Gamx_rhs); + map_buffer(cache, Gamy_rhs, Mh_ Gamy_rhs); + map_buffer(cache, Gamz_rhs, Mh_ Gamz_rhs); + map_buffer(cache, Lap_rhs, Mh_ Lap_rhs); + map_buffer(cache, betax_rhs, Mh_ betax_rhs); + map_buffer(cache, betay_rhs, Mh_ betay_rhs); + map_buffer(cache, betaz_rhs, Mh_ betaz_rhs); + map_buffer(cache, dtSfx_rhs, Mh_ dtSfx_rhs); + map_buffer(cache, dtSfy_rhs, Mh_ dtSfy_rhs); + map_buffer(cache, dtSfz_rhs, Mh_ dtSfz_rhs); + + + double sss[3] = {1,1,1}; double aas[3] = {-1,-1,1}; double asa[3] = {-1,1,-1}; double saa[3] = {1,-1,-1}; @@ -3116,9 +3325,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, #endif //cout<<"GPU meta data ready.\n"; - cudaThreadSynchronize(); - -//--------------test constant memory address & value-------------- + //--------------test constant memory address & value-------------- /* double rank = mpi_rank; cudaMemcpyToSymbol(F1o3,&rank, sizeof(double)); double ctest1 = -1; @@ -3137,11 +3344,10 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, //#4-----------------------calculate------------------------------ //4.0------enforce_ga--------- //sub_enforce_ga(matrix_size); - //4.1-----compute rhs--------- - compute_rhs_bssn_part1<<>>(); - cudaThreadSynchronize(); - - sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass); + //4.1-----compute rhs--------- + compute_rhs_bssn_part1<<>>(); + + sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass); sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas); sub_fderivs(Mh_ betaz,Mh_ fh,Mh_ betazx,Mh_ betazy,Mh_ betazz,ssa); sub_fderivs(Mh_ chi,Mh_ fh,Mh_ chix,Mh_ chiy,Mh_ chiz, sss); @@ -3153,44 +3359,37 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, sub_fderivs(Mh_ gxy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz, aas); sub_fderivs(Mh_ gxz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz, asa); sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa); - - compute_rhs_bssn_part2<<>>(); - cudaThreadSynchronize(); - - sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass); + + compute_rhs_bssn_part2<<>>(); + + sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass); sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas); sub_fdderivs(Mh_ betaz,Mh_ fh,Mh_ gxxz,Mh_ gxyz,Mh_ gxzz,Mh_ gyyz,Mh_ gyzz,Mh_ gzzz,ssa); sub_fderivs( Mh_ Gamx, Mh_ fh,Mh_ Gamxx, Mh_ Gamxy, Mh_ Gamxz,ass); sub_fderivs( Mh_ Gamy, Mh_ fh,Mh_ Gamyx, Mh_ Gamyy, Mh_ Gamyz,sas); sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa); - - compute_rhs_bssn_part3<<>>(); - cudaThreadSynchronize(); - - computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta); + + compute_rhs_bssn_part3<<>>(); + + computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta); computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta); computeRicci(Mh_ dzz,Mh_ Rzz,sss, meta); computeRicci(Mh_ gxy,Mh_ Rxy,aas, meta); computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta); computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta); - cudaThreadSynchronize(); - - compute_rhs_bssn_part4<<>>(); - cudaThreadSynchronize(); - - sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss); - - compute_rhs_bssn_part5<<>>(); - cudaThreadSynchronize(); - - sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss); - - compute_rhs_bssn_part6<<>>(); - cudaThreadSynchronize(); - -#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5) - sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss); + compute_rhs_bssn_part4<<>>(); + + sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss); + + compute_rhs_bssn_part5<<>>(); + + sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss); + + compute_rhs_bssn_part6<<>>(); + +#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5) + sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss); compute_rhs_bssn_part6_gauge<<>>(); #endif @@ -3257,19 +3456,17 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, } - if(co == 0){ - compute_rhs_bssn_part7<<>>(); - cudaThreadSynchronize(); - - sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss); + if(co == 0){ + compute_rhs_bssn_part7<<>>(); + + sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss); sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas); sub_fderivs(Mh_ Axz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz,asa); sub_fderivs(Mh_ Ayy,Mh_ fh,Mh_ gyyx,Mh_ gyyy,Mh_ gyyz,sss); - sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa); - sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss); - compute_rhs_bssn_part8<<>>(); - cudaThreadSynchronize(); - } + sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa); + sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss); + compute_rhs_bssn_part8<<>>(); + } #if (ABV == 1) cout<<"TODO: bssn_gpu.cu::2373 (ABV == 1)"<