From 5070134857bed9940c9ab0a3cb2b51c8506c6358 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Mon, 2 Mar 2026 21:14:35 +0800 Subject: [PATCH] =?UTF-8?q?perf(transfer=5Fcached):=20=E5=B0=86=20per-call?= =?UTF-8?q?=20new/delete=20=E7=9A=84=20req=5Fnode/req=5Fis=5Frecv/complete?= =?UTF-8?q?d=20=E6=95=B0=E7=BB=84=E7=A7=BB=E5=85=A5=20SyncCache=20?= =?UTF-8?q?=E5=A4=8D=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 避免 transfer_cached 每次调用分配释放 3 个临时数组,减少堆操作开销。 Co-Authored-By: Claude Opus 4.6 (1M context) --- AMSS_NCKU_source/Parallel.C | 31 +++++++++++++++++++++++-------- AMSS_NCKU_source/Parallel.h | 3 +++ 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 4e5e4ec..b87ce6d 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -4320,7 +4320,7 @@ Parallel::SyncCache::SyncCache() : valid(false), cpusize(0), combined_src(0), combined_dst(0), send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0), send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0), - lengths_valid(false) + lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0) { } // SyncCache invalidate: free grid segment lists but keep buffers @@ -4359,11 +4359,15 @@ void Parallel::SyncCache::destroy() if (recv_bufs) delete[] recv_bufs; if (reqs) delete[] reqs; if (stats) delete[] stats; + if (tc_req_node) delete[] tc_req_node; + if (tc_req_is_recv) delete[] tc_req_is_recv; + if (tc_completed) delete[] tc_completed; combined_src = combined_dst = 0; send_lengths = recv_lengths = 0; send_buf_caps = recv_buf_caps = 0; send_bufs = recv_bufs = 0; reqs = 0; stats = 0; + tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0; cpusize = 0; max_reqs = 0; } // transfer_cached: reuse pre-allocated buffers from SyncCache @@ -4379,9 +4383,9 @@ void Parallel::transfer_cached(MyList **src, MyList **src, MyList 0) data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - delete[] req_node; - delete[] req_is_recv; - delete[] completed; } void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache) { @@ -4498,6 +4498,9 @@ void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmet cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; } for (int node = 0; node < cpusize; node++) @@ -4598,6 +4601,9 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; } for (int node = 0; node < cpusize; node++) @@ -5856,6 +5862,9 @@ void Parallel::Restrict_cached(MyList *PatcL, MyList *PatfL, cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; } MyList *dst = build_complete_gsl(PatcL); @@ -5902,6 +5911,9 @@ void Parallel::OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL, cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; } MyList *dst = build_buffer_gsl(PatfL); @@ -5948,6 +5960,9 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; } MyList *dst = build_buffer_gsl(PatfL); diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h index 5a72797..0ab975c 100644 --- a/AMSS_NCKU_source/Parallel.h +++ b/AMSS_NCKU_source/Parallel.h @@ -108,6 +108,9 @@ namespace Parallel MPI_Status *stats; int max_reqs; bool lengths_valid; + int *tc_req_node; + int *tc_req_is_recv; + int *tc_completed; SyncCache(); void invalidate(); void destroy();