perf(transfer_cached): move the per-call new/delete'd req_node/req_is_recv/completed arrays into SyncCache for reuse

Avoid allocating and freeing three temporary arrays on every transfer_cached call, cutting per-call heap traffic.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
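The change follows the usual scratch-buffer caching pattern: size the per-request bookkeeping arrays once, next to the MPI_Request/MPI_Status arrays, and let every transfer_cached call borrow them instead of calling new[]/delete[] three times. A minimal standalone sketch of that pattern (ScratchCache, ensure() and do_transfer() are illustrative names, not this codebase's API):

#include <cstddef>

// Illustrative owner of reusable scratch arrays, sized once to max_reqs.
struct ScratchCache {
    int max_reqs = 0;
    int *req_node = nullptr;     // peer rank associated with each request slot
    int *req_is_recv = nullptr;  // nonzero if the slot holds a receive
    int *completed = nullptr;    // indices/flags filled by the completion loop

    void ensure(int n) {
        if (n <= max_reqs) return;           // already large enough: reuse as-is
        destroy();
        max_reqs = n;
        req_node = new int[max_reqs];
        req_is_recv = new int[max_reqs];
        completed = new int[max_reqs];
    }
    void destroy() {
        delete[] req_node; delete[] req_is_recv; delete[] completed;
        req_node = req_is_recv = completed = nullptr;
        max_reqs = 0;
    }
    ~ScratchCache() { destroy(); }
};

// Before: three new[]/delete[] pairs on every call.
// After: borrow the cached arrays; no per-call heap traffic.
void do_transfer(ScratchCache &cache, int cpusize) {
    cache.ensure(2 * cpusize);               // one send + one recv slot per peer
    int *req_node    = cache.req_node;
    int *req_is_recv = cache.req_is_recv;
    int *completed   = cache.completed;
    // ... post receives and sends, record per-slot metadata, drain completions ...
    (void)req_node; (void)req_is_recv; (void)completed;
}

In the actual change the arrays are allocated alongside cache.reqs/cache.stats wherever max_reqs is first set (Sync_cached, Sync_start, Restrict_cached, OutBdLow2Hi_cached, OutBdLow2Himix_cached) and released in SyncCache::destroy(), so transfer_cached itself only needs the three pointer assignments shown in the diff below.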
@@ -4320,7 +4320,7 @@ Parallel::SyncCache::SyncCache()
 : valid(false), cpusize(0), combined_src(0), combined_dst(0),
 send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
 send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
-lengths_valid(false)
+lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0)
 {
 }
 // SyncCache invalidate: free grid segment lists but keep buffers
@@ -4359,11 +4359,15 @@ void Parallel::SyncCache::destroy()
 if (recv_bufs) delete[] recv_bufs;
 if (reqs) delete[] reqs;
 if (stats) delete[] stats;
+if (tc_req_node) delete[] tc_req_node;
+if (tc_req_is_recv) delete[] tc_req_is_recv;
+if (tc_completed) delete[] tc_completed;
 combined_src = combined_dst = 0;
 send_lengths = recv_lengths = 0;
 send_buf_caps = recv_buf_caps = 0;
 send_bufs = recv_bufs = 0;
 reqs = 0; stats = 0;
+tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0;
 cpusize = 0; max_reqs = 0;
 }
 // transfer_cached: reuse pre-allocated buffers from SyncCache
@@ -4379,9 +4383,9 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
 int req_no = 0;
 int pending_recv = 0;
 int node;
-int *req_node = new int[cache.max_reqs];
-int *req_is_recv = new int[cache.max_reqs];
-int *completed = new int[cache.max_reqs];
+int *req_node = cache.tc_req_node;
+int *req_is_recv = cache.tc_req_is_recv;
+int *completed = cache.tc_completed;
 
 // Post receives first so peers can progress rendezvous early.
 for (node = 0; node < cpusize; node++)
@@ -4466,10 +4470,6 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
 
 if (self_len > 0)
 data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
 
-delete[] req_node;
-delete[] req_is_recv;
-delete[] completed;
 }
 void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
 {
@@ -4498,6 +4498,9 @@ void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
 cache.max_reqs = 2 * cpusize;
 cache.reqs = new MPI_Request[cache.max_reqs];
 cache.stats = new MPI_Status[cache.max_reqs];
+cache.tc_req_node = new int[cache.max_reqs];
+cache.tc_req_is_recv = new int[cache.max_reqs];
+cache.tc_completed = new int[cache.max_reqs];
 }
 
 for (int node = 0; node < cpusize; node++)
@@ -4598,6 +4601,9 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
 cache.max_reqs = 2 * cpusize;
 cache.reqs = new MPI_Request[cache.max_reqs];
 cache.stats = new MPI_Status[cache.max_reqs];
+cache.tc_req_node = new int[cache.max_reqs];
+cache.tc_req_is_recv = new int[cache.max_reqs];
+cache.tc_completed = new int[cache.max_reqs];
 }
 
 for (int node = 0; node < cpusize; node++)
@@ -5856,6 +5862,9 @@ void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
 cache.max_reqs = 2 * cpusize;
 cache.reqs = new MPI_Request[cache.max_reqs];
 cache.stats = new MPI_Status[cache.max_reqs];
+cache.tc_req_node = new int[cache.max_reqs];
+cache.tc_req_is_recv = new int[cache.max_reqs];
+cache.tc_completed = new int[cache.max_reqs];
 }
 
 MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
@@ -5902,6 +5911,9 @@ void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
 cache.max_reqs = 2 * cpusize;
 cache.reqs = new MPI_Request[cache.max_reqs];
 cache.stats = new MPI_Status[cache.max_reqs];
+cache.tc_req_node = new int[cache.max_reqs];
+cache.tc_req_is_recv = new int[cache.max_reqs];
+cache.tc_completed = new int[cache.max_reqs];
 }
 
 MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
@@ -5948,6 +5960,9 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
 cache.max_reqs = 2 * cpusize;
 cache.reqs = new MPI_Request[cache.max_reqs];
 cache.stats = new MPI_Status[cache.max_reqs];
+cache.tc_req_node = new int[cache.max_reqs];
+cache.tc_req_is_recv = new int[cache.max_reqs];
+cache.tc_completed = new int[cache.max_reqs];
 }
 
 MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
@@ -108,6 +108,9 @@ namespace Parallel
 MPI_Status *stats;
 int max_reqs;
 bool lengths_valid;
+int *tc_req_node;
+int *tc_req_is_recv;
+int *tc_completed;
 SyncCache();
 void invalidate();
 void destroy();
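The new arrays mirror reqs/stats in size (max_reqs = 2 * cpusize, one send and one receive slot per peer) because the completion loop indexes them by request slot. A hedged sketch of how such scratch arrays are typically consumed, assuming an MPI_Waitsome-style drain (the project's actual loop may differ; unpack_recv_from is a placeholder, not a project function):

#include <mpi.h>

// Sketch of a completion drain over cached scratch arrays (not the project's exact code).
void drain_requests(int nreq, MPI_Request *reqs, MPI_Status *stats,
                    const int *req_node, const int *req_is_recv, int *completed)
{
    (void)req_node;                              // only referenced by the placeholder below
    int remaining = nreq;
    while (remaining > 0) {
        int outcount = 0;
        // completed[] receives the indices of the requests that finished this round.
        MPI_Waitsome(nreq, reqs, &outcount, completed, stats);
        if (outcount == MPI_UNDEFINED) break;    // no active requests remain
        for (int i = 0; i < outcount; ++i) {
            int slot = completed[i];
            if (req_is_recv[slot]) {
                // unpack_recv_from(req_node[slot]);  // placeholder: unpack this peer's buffer
            }
        }
        remaining -= outcount;
    }
}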