perf(transfer_cached): 将 per-call new/delete 的 req_node/req_is_recv/completed 数组移入 SyncCache 复用
避免 transfer_cached 每次调用分配释放 3 个临时数组,减少堆操作开销。

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -4320,7 +4320,7 @@ Parallel::SyncCache::SyncCache()
|
||||
: valid(false), cpusize(0), combined_src(0), combined_dst(0),
|
||||
send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
|
||||
send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
|
||||
lengths_valid(false)
|
||||
lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0)
|
||||
{
|
||||
}
|
||||
// SyncCache invalidate: free grid segment lists but keep buffers
|
||||
@@ -4359,11 +4359,15 @@ void Parallel::SyncCache::destroy()
|
||||
if (recv_bufs) delete[] recv_bufs;
|
||||
if (reqs) delete[] reqs;
|
||||
if (stats) delete[] stats;
|
||||
if (tc_req_node) delete[] tc_req_node;
|
||||
if (tc_req_is_recv) delete[] tc_req_is_recv;
|
||||
if (tc_completed) delete[] tc_completed;
|
||||
combined_src = combined_dst = 0;
|
||||
send_lengths = recv_lengths = 0;
|
||||
send_buf_caps = recv_buf_caps = 0;
|
||||
send_bufs = recv_bufs = 0;
|
||||
reqs = 0; stats = 0;
|
||||
tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0;
|
||||
cpusize = 0; max_reqs = 0;
|
||||
}
|
||||
// transfer_cached: reuse pre-allocated buffers from SyncCache
|
||||
@@ -4379,9 +4383,9 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
||||
int req_no = 0;
|
||||
int pending_recv = 0;
|
||||
int node;
|
||||
int *req_node = new int[cache.max_reqs];
|
||||
int *req_is_recv = new int[cache.max_reqs];
|
||||
int *completed = new int[cache.max_reqs];
|
||||
int *req_node = cache.tc_req_node;
|
||||
int *req_is_recv = cache.tc_req_is_recv;
|
||||
int *completed = cache.tc_completed;
|
||||
|
||||
// Post receives first so peers can progress rendezvous early.
|
||||
for (node = 0; node < cpusize; node++)
|
||||
@@ -4466,10 +4470,6 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
||||
|
||||
if (self_len > 0)
|
||||
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||
|
||||
delete[] req_node;
|
||||
delete[] req_is_recv;
|
||||
delete[] completed;
|
||||
}
|
||||
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
|
||||
{
|
||||
@@ -4498,6 +4498,9 @@ void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
|
||||
cache.max_reqs = 2 * cpusize;
|
||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||
cache.stats = new MPI_Status[cache.max_reqs];
|
||||
cache.tc_req_node = new int[cache.max_reqs];
|
||||
cache.tc_req_is_recv = new int[cache.max_reqs];
|
||||
cache.tc_completed = new int[cache.max_reqs];
|
||||
}
|
||||
|
||||
for (int node = 0; node < cpusize; node++)
|
||||
@@ -4598,6 +4601,9 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
||||
cache.max_reqs = 2 * cpusize;
|
||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||
cache.stats = new MPI_Status[cache.max_reqs];
|
||||
cache.tc_req_node = new int[cache.max_reqs];
|
||||
cache.tc_req_is_recv = new int[cache.max_reqs];
|
||||
cache.tc_completed = new int[cache.max_reqs];
|
||||
}
|
||||
|
||||
for (int node = 0; node < cpusize; node++)
|
||||
@@ -5856,6 +5862,9 @@ void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
cache.max_reqs = 2 * cpusize;
|
||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||
cache.stats = new MPI_Status[cache.max_reqs];
|
||||
cache.tc_req_node = new int[cache.max_reqs];
|
||||
cache.tc_req_is_recv = new int[cache.max_reqs];
|
||||
cache.tc_completed = new int[cache.max_reqs];
|
||||
}
|
||||
|
||||
MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
|
||||
@@ -5902,6 +5911,9 @@ void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
cache.max_reqs = 2 * cpusize;
|
||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||
cache.stats = new MPI_Status[cache.max_reqs];
|
||||
cache.tc_req_node = new int[cache.max_reqs];
|
||||
cache.tc_req_is_recv = new int[cache.max_reqs];
|
||||
cache.tc_completed = new int[cache.max_reqs];
|
||||
}
|
||||
|
||||
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
|
||||
@@ -5948,6 +5960,9 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
cache.max_reqs = 2 * cpusize;
|
||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||
cache.stats = new MPI_Status[cache.max_reqs];
|
||||
cache.tc_req_node = new int[cache.max_reqs];
|
||||
cache.tc_req_is_recv = new int[cache.max_reqs];
|
||||
cache.tc_completed = new int[cache.max_reqs];
|
||||
}
|
||||
|
||||
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
|
||||
|
||||
@@ -108,6 +108,9 @@ namespace Parallel
|
||||
MPI_Status *stats;
|
||||
int max_reqs;
|
||||
bool lengths_valid;
|
||||
int *tc_req_node;
|
||||
int *tc_req_is_recv;
|
||||
int *tc_completed;
|
||||
SyncCache();
|
||||
void invalidate();
|
||||
void destroy();
|
||||
|
||||
Reference in New Issue
Block a user