Cache data_packer lengths in Sync_start to skip redundant buffer-size traversals

The data_packer(NULL, ...) calls that compute send/recv buffer lengths
traverse all grid segments × variables × nprocs on every Sync_start
invocation, even though the lengths never change once the cache is built.
Add a lengths_valid flag to SyncCache so the lengths are computed once
and then reused on every subsequent call (Sync_start runs 4× per RK4 step).
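
In isolation, the pattern is the usual compute-once/flag idiom. A minimal
sketch follows, with hypothetical names: LengthCache, expensive_length, and
the toy sync_start stand in for SyncCache, the data_packer(NULL, ...) dry
run, and the real Sync_start:

    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for the data_packer(NULL, ...) dry run, which
    // walks grid segments x variables for one peer just to count doubles.
    static int expensive_length(int node) { return 64 * (node + 1); }

    // Mirrors the relevant slice of SyncCache (names simplified).
    struct LengthCache {
      std::vector<int> lengths;    // one cached buffer length per peer rank
      bool lengths_valid = false;  // true once every entry has been filled

      explicit LengthCache(int nprocs) : lengths(nprocs, 0) {}
      void invalidate() { lengths_valid = false; }  // grid changed: recount
    };

    // The guarded pattern from Sync_start: traverse once, look up after.
    static void sync_start(LengthCache &cache) {
      for (int node = 0; node < (int)cache.lengths.size(); ++node) {
        int length;
        if (!cache.lengths_valid) {
          length = expensive_length(node);  // first call after (re)build
          cache.lengths[node] = length;
        } else {
          length = cache.lengths[node];     // later calls: O(1) lookup
        }
        std::printf("node %d -> %d doubles\n", node, length);
        // ... grow buffers to `length` and post MPI calls here ...
      }
      cache.lengths_valid = true;  // flipped only after every rank is counted
    }

    int main() {
      LengthCache cache(4);
      sync_start(cache);  // computes and caches all lengths
      sync_start(cache);  // pure lookups (the 4x-per-RK4-step case)
    }

The flag flips only after the full per-rank loop, so an interrupted first
pass can never leave half-filled entries marked valid; invalidate() clears
it, so a rebuilt grid forces a fresh count.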

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-02-10 21:39:22 +08:00
parent d06d5b4db8
commit e09ae438a2
2 changed files with 26 additions and 7 deletions

View File

@@ -3853,7 +3853,8 @@ void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
 Parallel::SyncCache::SyncCache()
   : valid(false), cpusize(0), combined_src(0), combined_dst(0),
     send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
-    send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0)
+    send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
+    lengths_valid(false)
 {
 }
 // SyncCache invalidate: free grid segment lists but keep buffers
@@ -3871,6 +3872,7 @@ void Parallel::SyncCache::invalidate()
     send_lengths[i] = recv_lengths[i] = 0;
   }
   valid = false;
+  lengths_valid = false;
 }
 // SyncCache destroy: free everything
 void Parallel::SyncCache::destroy()
@@ -4172,8 +4174,13 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
   {
     if (node == myrank)
     {
-      int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
-      cache.recv_lengths[node] = length;
+      int length;
+      if (!cache.lengths_valid) {
+        length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        cache.recv_lengths[node] = length;
+      } else {
+        length = cache.recv_lengths[node];
+      }
       if (length > 0)
       {
         if (length > cache.recv_buf_caps[node])
@@ -4187,8 +4194,13 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
     }
     else
     {
-      int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
-      cache.send_lengths[node] = slength;
+      int slength;
+      if (!cache.lengths_valid) {
+        slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        cache.send_lengths[node] = slength;
+      } else {
+        slength = cache.send_lengths[node];
+      }
       if (slength > 0)
       {
         if (slength > cache.send_buf_caps[node])
@@ -4200,8 +4212,13 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
         data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
         MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
       }
-      int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
-      cache.recv_lengths[node] = rlength;
+      int rlength;
+      if (!cache.lengths_valid) {
+        rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
+        cache.recv_lengths[node] = rlength;
+      } else {
+        rlength = cache.recv_lengths[node];
+      }
       if (rlength > 0)
       {
         if (rlength > cache.recv_buf_caps[node])
@@ -4214,6 +4231,7 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
       }
     }
   }
+  cache.lengths_valid = true;
 }
 // Sync_finish: wait for async MPI operations and unpack
 void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,

View File

@@ -97,6 +97,7 @@ namespace Parallel
     MPI_Request *reqs;
     MPI_Status *stats;
     int max_reqs;
+    bool lengths_valid;
     SyncCache();
     void invalidate();
     void destroy();