From 4012e9d0686e3941b1ae90c56b9b38c2cf551926 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Mon, 2 Mar 2026 20:48:38 +0800 Subject: [PATCH] =?UTF-8?q?perf(RestrictProlong):=20=E7=94=A8=20Restrict?= =?UTF-8?q?=5Fcached/OutBdLow2Hi=5Fcached=20=E6=9B=BF=E6=8D=A2=E9=9D=9E?= =?UTF-8?q?=E7=BC=93=E5=AD=98=E7=89=88=E6=9C=AC=EF=BC=8CSync=5Ffinish=20?= =?UTF-8?q?=E6=94=B9=E4=B8=BA=E6=B8=90=E8=BF=9B=E5=BC=8F=E8=A7=A3=E5=8C=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - RestrictProlong/RestrictProlong_aux 中的 Restrict() 和 OutBdLow2Hi() 替换为 _cached 版本, 复用 gridseg 列表和 MPI 缓冲区,避免每次调用重新分配 - 新增 sync_cache_restrict/sync_cache_outbd 两组 per-level 缓存 - Sync_finish 从 MPI_Waitall 改为 MPI_Waitsome 渐进式解包,降低尾延迟 - AsyncSyncState 扩展 req_node/req_is_recv/pending_recv 字段支持渐进解包 Co-Authored-By: Claude Opus 4.6 (1M context) --- AMSS_NCKU_source/Parallel.C | 959 ++++++++++++++++++---------------- AMSS_NCKU_source/Parallel.h | 5 +- AMSS_NCKU_source/bssn_class.C | 44 +- AMSS_NCKU_source/bssn_class.h | 2 + 4 files changed, 527 insertions(+), 483 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index e3bb4a3..4e5e4ec 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -3883,263 +3883,263 @@ int Parallel::data_packermix(double *data, MyList *src, MyLis return size_out; } // -void Parallel::transfer(MyList **src, MyList **dst, - MyList *VarList1 /* source */, MyList *VarList2 /*target */, - int Symmetry) -{ - int myrank, cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - int node; - - MPI_Request *reqs = new MPI_Request[2 * cpusize]; - MPI_Status *stats = new MPI_Status[2 * cpusize]; - int *req_node = new int[2 * cpusize]; - int *req_is_recv = new int[2 * cpusize]; - int *completed = new int[2 * cpusize]; - int req_no = 0; - int pending_recv = 0; - - double **send_data = new double *[cpusize]; - double **rec_data = new double *[cpusize]; - int *send_lengths = new int[cpusize]; - int *recv_lengths = new int[cpusize]; - - for (node = 0; node < cpusize; node++) - { - send_data[node] = rec_data[node] = 0; - send_lengths[node] = recv_lengths[node] = 0; - } - - // Post receives first so peers can progress rendezvous early. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - if (recv_lengths[node] > 0) - { - rec_data[node] = new double[recv_lengths[node]]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 1" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 1; - req_no++; - pending_recv++; - } - } - - // Local transfer on this rank. - recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - if (recv_lengths[myrank] > 0) - { - rec_data[myrank] = new double[recv_lengths[myrank]]; - if (!rec_data[myrank]) - { - cout << "out of memory when new in short transfer, place 2" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - } - - // Pack and post sends. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - if (send_lengths[node] > 0) - { - send_data[node] = new double[send_lengths[node]]; - if (!send_data[node]) - { - cout << "out of memory when new in short transfer, place 3" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) - { - int outcount = 0; - MPI_Waitsome(req_no, reqs, &outcount, completed, stats); - if (outcount == MPI_UNDEFINED) break; - - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && req_is_recv[idx]) - { - int recv_node = req_node[idx]; - data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); - pending_recv--; - } - } - } - - if (req_no > 0) MPI_Waitall(req_no, reqs, stats); - - if (rec_data[myrank]) - data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - for (node = 0; node < cpusize; node++) - { - if (send_data[node]) - delete[] send_data[node]; - if (rec_data[node]) - delete[] rec_data[node]; - } - - delete[] reqs; - delete[] stats; - delete[] req_node; - delete[] req_is_recv; - delete[] completed; - delete[] send_data; - delete[] rec_data; - delete[] send_lengths; - delete[] recv_lengths; -} +void Parallel::transfer(MyList **src, MyList **dst, + MyList *VarList1 /* source */, MyList *VarList2 /*target */, + int Symmetry) +{ + int myrank, cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + int node; + + MPI_Request *reqs = new MPI_Request[2 * cpusize]; + MPI_Status *stats = new MPI_Status[2 * cpusize]; + int *req_node = new int[2 * cpusize]; + int *req_is_recv = new int[2 * cpusize]; + int *completed = new int[2 * cpusize]; + int req_no = 0; + int pending_recv = 0; + + double **send_data = new double *[cpusize]; + double **rec_data = new double *[cpusize]; + int *send_lengths = new int[cpusize]; + int *recv_lengths = new int[cpusize]; + + for (node = 0; node < cpusize; node++) + { + send_data[node] = rec_data[node] = 0; + send_lengths[node] = recv_lengths[node] = 0; + } + + // Post receives first so peers can progress rendezvous early. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + if (recv_lengths[node] > 0) + { + rec_data[node] = new double[recv_lengths[node]]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 1" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + if (recv_lengths[myrank] > 0) + { + rec_data[myrank] = new double[recv_lengths[myrank]]; + if (!rec_data[myrank]) + { + cout << "out of memory when new in short transfer, place 2" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (send_lengths[node] > 0) + { + send_data[node] = new double[send_lengths[node]]; + if (!send_data[node]) + { + cout << "out of memory when new in short transfer, place 3" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, reqs, &outcount, completed, stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node = req_node[idx]; + data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, reqs, stats); + + if (rec_data[myrank]) + data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + for (node = 0; node < cpusize; node++) + { + if (send_data[node]) + delete[] send_data[node]; + if (rec_data[node]) + delete[] rec_data[node]; + } + + delete[] reqs; + delete[] stats; + delete[] req_node; + delete[] req_is_recv; + delete[] completed; + delete[] send_data; + delete[] rec_data; + delete[] send_lengths; + delete[] recv_lengths; +} // -void Parallel::transfermix(MyList **src, MyList **dst, - MyList *VarList1 /* source */, MyList *VarList2 /*target */, - int Symmetry) -{ - int myrank, cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - int node; - - MPI_Request *reqs = new MPI_Request[2 * cpusize]; - MPI_Status *stats = new MPI_Status[2 * cpusize]; - int *req_node = new int[2 * cpusize]; - int *req_is_recv = new int[2 * cpusize]; - int *completed = new int[2 * cpusize]; - int req_no = 0; - int pending_recv = 0; - - double **send_data = new double *[cpusize]; - double **rec_data = new double *[cpusize]; - int *send_lengths = new int[cpusize]; - int *recv_lengths = new int[cpusize]; - - for (node = 0; node < cpusize; node++) - { - send_data[node] = rec_data[node] = 0; - send_lengths[node] = recv_lengths[node] = 0; - } - - // Post receives first so peers can progress rendezvous early. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - if (recv_lengths[node] > 0) - { - rec_data[node] = new double[recv_lengths[node]]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 1" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 1; - req_no++; - pending_recv++; - } - } - - // Local transfer on this rank. - recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - if (recv_lengths[myrank] > 0) - { - rec_data[myrank] = new double[recv_lengths[myrank]]; - if (!rec_data[myrank]) - { - cout << "out of memory when new in short transfer, place 2" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - } - - // Pack and post sends. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - if (send_lengths[node] > 0) - { - send_data[node] = new double[send_lengths[node]]; - if (!send_data[node]) - { - cout << "out of memory when new in short transfer, place 3" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) - { - int outcount = 0; - MPI_Waitsome(req_no, reqs, &outcount, completed, stats); - if (outcount == MPI_UNDEFINED) break; - - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && req_is_recv[idx]) - { - int recv_node = req_node[idx]; - data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); - pending_recv--; - } - } - } - - if (req_no > 0) MPI_Waitall(req_no, reqs, stats); - - if (rec_data[myrank]) - data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - for (node = 0; node < cpusize; node++) - { - if (send_data[node]) - delete[] send_data[node]; - if (rec_data[node]) - delete[] rec_data[node]; - } - - delete[] reqs; - delete[] stats; - delete[] req_node; - delete[] req_is_recv; - delete[] completed; - delete[] send_data; - delete[] rec_data; - delete[] send_lengths; - delete[] recv_lengths; -} +void Parallel::transfermix(MyList **src, MyList **dst, + MyList *VarList1 /* source */, MyList *VarList2 /*target */, + int Symmetry) +{ + int myrank, cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + int node; + + MPI_Request *reqs = new MPI_Request[2 * cpusize]; + MPI_Status *stats = new MPI_Status[2 * cpusize]; + int *req_node = new int[2 * cpusize]; + int *req_is_recv = new int[2 * cpusize]; + int *completed = new int[2 * cpusize]; + int req_no = 0; + int pending_recv = 0; + + double **send_data = new double *[cpusize]; + double **rec_data = new double *[cpusize]; + int *send_lengths = new int[cpusize]; + int *recv_lengths = new int[cpusize]; + + for (node = 0; node < cpusize; node++) + { + send_data[node] = rec_data[node] = 0; + send_lengths[node] = recv_lengths[node] = 0; + } + + // Post receives first so peers can progress rendezvous early. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + if (recv_lengths[node] > 0) + { + rec_data[node] = new double[recv_lengths[node]]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 1" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + if (recv_lengths[myrank] > 0) + { + rec_data[myrank] = new double[recv_lengths[myrank]]; + if (!rec_data[myrank]) + { + cout << "out of memory when new in short transfer, place 2" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (send_lengths[node] > 0) + { + send_data[node] = new double[send_lengths[node]]; + if (!send_data[node]) + { + cout << "out of memory when new in short transfer, place 3" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, reqs, &outcount, completed, stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node = req_node[idx]; + data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, reqs, stats); + + if (rec_data[myrank]) + data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + for (node = 0; node < cpusize; node++) + { + if (send_data[node]) + delete[] send_data[node]; + if (rec_data[node]) + delete[] rec_data[node]; + } + + delete[] reqs; + delete[] stats; + delete[] req_node; + delete[] req_is_recv; + delete[] completed; + delete[] send_data; + delete[] rec_data; + delete[] send_lengths; + delete[] recv_lengths; +} void Parallel::Sync(Patch *Pat, MyList *VarList, int Symmetry) { int cpusize; @@ -4367,111 +4367,110 @@ void Parallel::SyncCache::destroy() cpusize = 0; max_reqs = 0; } // transfer_cached: reuse pre-allocated buffers from SyncCache -void Parallel::transfer_cached(MyList **src, MyList **dst, - MyList *VarList1, MyList *VarList2, - int Symmetry, SyncCache &cache) -{ - int myrank; +void Parallel::transfer_cached(MyList **src, MyList **dst, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) +{ + int myrank; MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); int cpusize = cache.cpusize; - int req_no = 0; - int pending_recv = 0; - int node; - int *req_node = new int[cache.max_reqs]; - int *req_is_recv = new int[cache.max_reqs]; - int *completed = new int[cache.max_reqs]; - - // Post receives first so peers can progress rendezvous early. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[node] = rlength; - if (rlength > 0) - { - if (rlength > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[rlength]; - cache.recv_buf_caps[node] = rlength; - } - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 1; - req_no++; - pending_recv++; - } - } - - // Local transfer on this rank. - int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[myrank] = self_len; - if (self_len > 0) - { - if (self_len > cache.recv_buf_caps[myrank]) - { - if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank]; - cache.recv_bufs[myrank] = new double[self_len]; - cache.recv_buf_caps[myrank] = self_len; - } - data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - } - - // Pack and post sends. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - cache.send_lengths[node] = slength; - if (slength > 0) - { - if (slength > cache.send_buf_caps[node]) - { - if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; - cache.send_bufs[node] = new double[slength]; - cache.send_buf_caps[node] = slength; - } - data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) - { - int outcount = 0; - MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); - if (outcount == MPI_UNDEFINED) break; - - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && req_is_recv[idx]) - { - int recv_node_i = req_node[idx]; - data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); - pending_recv--; - } - } - } - - if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); - - if (self_len > 0) - data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - delete[] req_node; - delete[] req_is_recv; - delete[] completed; -} -// Sync_cached: build grid segment lists on first call, reuse on subsequent calls + int req_no = 0; + int pending_recv = 0; + int node; + int *req_node = new int[cache.max_reqs]; + int *req_is_recv = new int[cache.max_reqs]; + int *completed = new int[cache.max_reqs]; + + // Post receives first so peers can progress rendezvous early. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[node] = rlength; + if (rlength > 0) + { + if (rlength > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[rlength]; + cache.recv_buf_caps[node] = rlength; + } + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[myrank] = self_len; + if (self_len > 0) + { + if (self_len > cache.recv_buf_caps[myrank]) + { + if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank]; + cache.recv_bufs[myrank] = new double[self_len]; + cache.recv_buf_caps[myrank] = self_len; + } + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + cache.send_lengths[node] = slength; + if (slength > 0) + { + if (slength > cache.send_buf_caps[node]) + { + if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; + cache.send_bufs[node] = new double[slength]; + cache.send_buf_caps[node] = slength; + } + data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node_i = req_node[idx]; + data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); + + if (self_len > 0) + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + delete[] req_node; + delete[] req_is_recv; + delete[] completed; +} void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache) { if (!cache.valid) @@ -4669,6 +4668,11 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr int cpusize = cache.cpusize; state.req_no = 0; state.active = true; + state.pending_recv = 0; + // Allocate tracking arrays + delete[] state.req_node; delete[] state.req_is_recv; + state.req_node = new int[cache.max_reqs]; + state.req_is_recv = new int[cache.max_reqs]; MyList **src = cache.combined_src; MyList **dst = cache.combined_dst; @@ -4713,6 +4717,8 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr cache.send_buf_caps[node] = slength; } data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + state.req_node[state.req_no] = node; + state.req_is_recv[state.req_no] = 0; MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); } int rlength; @@ -4730,29 +4736,60 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr cache.recv_bufs[node] = new double[rlength]; cache.recv_buf_caps[node] = rlength; } + state.req_node[state.req_no] = node; + state.req_is_recv[state.req_no] = 1; + state.pending_recv++; MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); } } } cache.lengths_valid = true; } -// Sync_finish: wait for async MPI operations and unpack +// Sync_finish: progressive unpack as receives complete, then wait for sends void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, MyList *VarList, int Symmetry) { if (!state.active) return; - MPI_Waitall(state.req_no, cache.reqs, cache.stats); - - int cpusize = cache.cpusize; + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MyList **src = cache.combined_src; MyList **dst = cache.combined_dst; - for (int node = 0; node < cpusize; node++) - if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0) - data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry); + // Unpack local data first (no MPI needed) + if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0) + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry); + // Progressive unpack of remote receives + if (state.pending_recv > 0 && state.req_no > 0) + { + int pending = state.pending_recv; + int *completed = new int[cache.max_reqs]; + while (pending > 0) + { + int outcount = 0; + MPI_Waitsome(state.req_no, cache.reqs, &outcount, completed, cache.stats); + if (outcount == MPI_UNDEFINED) break; + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && state.req_is_recv[idx]) + { + int recv_node = state.req_node[idx]; + data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry); + pending--; + } + } + } + delete[] completed; + } + + // Wait for remaining sends + if (state.req_no > 0) MPI_Waitall(state.req_no, cache.reqs, cache.stats); + + delete[] state.req_node; state.req_node = 0; + delete[] state.req_is_recv; state.req_is_recv = 0; state.active = false; } // collect buffer grid segments or blocks for the periodic boundary condition of given patch @@ -5883,9 +5920,9 @@ void Parallel::OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL, } // OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking -void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, - MyList *VarList1, MyList *VarList2, - int Symmetry, SyncCache &cache) +void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) { if (!cache.valid) { @@ -5931,100 +5968,100 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, MPI_Comm_rank(MPI_COMM_WORLD, &myrank); int cpusize = cache.cpusize; - int req_no = 0; - int pending_recv = 0; - int *req_node = new int[cache.max_reqs]; - int *req_is_recv = new int[cache.max_reqs]; - int *completed = new int[cache.max_reqs]; - - // Post receives first so peers can progress rendezvous early. - for (int node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[node] = rlength; - if (rlength > 0) - { - if (rlength > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[rlength]; - cache.recv_buf_caps[node] = rlength; - } - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 1; - req_no++; - pending_recv++; - } - } - - // Local transfer on this rank. - int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[myrank] = self_len; - if (self_len > 0) - { - if (self_len > cache.recv_buf_caps[myrank]) - { - if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank]; - cache.recv_bufs[myrank] = new double[self_len]; - cache.recv_buf_caps[myrank] = self_len; - } - data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - } - - // Pack and post sends. - for (int node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - cache.send_lengths[node] = slength; - if (slength > 0) - { - if (slength > cache.send_buf_caps[node]) - { - if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; - cache.send_bufs[node] = new double[slength]; - cache.send_buf_caps[node] = slength; - } - data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) - { - int outcount = 0; - MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); - if (outcount == MPI_UNDEFINED) break; - - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && req_is_recv[idx]) - { - int recv_node_i = req_node[idx]; - data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); - pending_recv--; - } - } - } - - if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); - - if (self_len > 0) - data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - delete[] req_node; - delete[] req_is_recv; - delete[] completed; -} + int req_no = 0; + int pending_recv = 0; + int *req_node = new int[cache.max_reqs]; + int *req_is_recv = new int[cache.max_reqs]; + int *completed = new int[cache.max_reqs]; + + // Post receives first so peers can progress rendezvous early. + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[node] = rlength; + if (rlength > 0) + { + if (rlength > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[rlength]; + cache.recv_buf_caps[node] = rlength; + } + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[myrank] = self_len; + if (self_len > 0) + { + if (self_len > cache.recv_buf_caps[myrank]) + { + if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank]; + cache.recv_bufs[myrank] = new double[self_len]; + cache.recv_buf_caps[myrank] = self_len; + } + data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + cache.send_lengths[node] = slength; + if (slength > 0) + { + if (slength > cache.send_buf_caps[node]) + { + if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; + cache.send_bufs[node] = new double[slength]; + cache.send_buf_caps[node] = slength; + } + data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node_i = req_node[idx]; + data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); + + if (self_len > 0) + data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + delete[] req_node; + delete[] req_is_recv; + delete[] completed; +} // collect all buffer grid segments or blocks for given patch MyList *Parallel::build_buffer_gsl(Patch *Pat) diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h index e17f365..5a72797 100644 --- a/AMSS_NCKU_source/Parallel.h +++ b/AMSS_NCKU_source/Parallel.h @@ -121,7 +121,10 @@ namespace Parallel struct AsyncSyncState { int req_no; bool active; - AsyncSyncState() : req_no(0), active(false) {} + int *req_node; + int *req_is_recv; + int pending_recv; + AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {} }; void Sync_start(MyList *PatL, MyList *VarList, int Symmetry, diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index eb84f8e..432747e 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -736,6 +736,8 @@ void bssn_class::Initialize() sync_cache_cor = new Parallel::SyncCache[GH->levels]; sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; + sync_cache_restrict = new Parallel::SyncCache[GH->levels]; + sync_cache_outbd = new Parallel::SyncCache[GH->levels]; } //================================================================================================ @@ -2213,7 +2215,7 @@ void bssn_class::Evolve(int Steps) GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2)) @@ -2429,7 +2431,7 @@ void bssn_class::RecursiveStep(int lev) if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -2608,7 +2610,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -2775,7 +2777,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } // a_stream.clear(); // a_stream.str(""); @@ -2790,7 +2792,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } // a_stream.clear(); // a_stream.str(""); @@ -2809,7 +2811,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } // a_stream.clear(); // a_stream.str(""); @@ -2825,7 +2827,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } // a_stream.clear(); // a_stream.str(""); @@ -5796,7 +5798,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry); + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]); #elif (RPB == 1) // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); @@ -5820,7 +5822,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); #endif @@ -5847,7 +5849,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]); #elif (RPB == 1) // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); @@ -5871,7 +5873,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #endif @@ -5940,7 +5942,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, } #if (RPB == 0) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry); + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]); #elif (RPB == 1) // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); @@ -5950,7 +5952,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); #endif @@ -5962,7 +5964,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, else // no time refinement levels and for all same time levels { #if (RPB == 0) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]); #elif (RPB == 1) // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); @@ -5972,7 +5974,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #endif @@ -6027,7 +6029,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) } #if (RPB == 0) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry); + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]); #elif (RPB == 1) // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry); @@ -6037,7 +6039,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); #endif @@ -6051,7 +6053,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) if (myrank == 0) cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl; #if (RPB == 0) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]); #elif (RPB == 1) // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry); @@ -6061,7 +6063,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); #endif @@ -6102,7 +6104,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB) #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); #endif @@ -6115,7 +6117,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB) { #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); #endif diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h index db434e2..5a8eb2a 100644 --- a/AMSS_NCKU_source/bssn_class.h +++ b/AMSS_NCKU_source/bssn_class.h @@ -130,6 +130,8 @@ public: Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1] Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev] + Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong + Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ConVMonitor;