Add SyncBegin/SyncEnd to Parallel for MPI communication-computation overlap
Split the blocking Parallel::Sync into an asynchronous SyncBegin (initiates the local copy and posts MPI_Isend/MPI_Irecv) and SyncEnd (MPI_Waitall + unpack). This allows the MPI ghost zone exchange to overlap with error checking and Shell patch computation. Step() in bssn_class.C is modified, for both the PSTR==0 and the PSTR==1/2/3 versions, to start the sync before the error checks, so that the MPI_Allreduce overlaps with the ongoing ghost zone transfers.

Co-authored-by: copilot-swe-agent[bot] <198982749+copilot@users.noreply.github.com>
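A minimal sketch of the intended call pattern in Step() (PatL, VarList and Symmetry match the SyncBegin signature below; the error-check helper shown is an illustrative placeholder, not the actual routine name in bssn_class.C):

    // post the ghost zone exchange without blocking
    Parallel::SyncHandle *sh = Parallel::SyncBegin(PatL, VarList, Symmetry);

    // work that does not read ghost zones can run while the Isend/Irecv are in flight,
    // e.g. the error check whose MPI_Allreduce now overlaps the exchange
    double local_err = compute_local_error(PatL);   // hypothetical placeholder
    double global_err;
    MPI_Allreduce(&local_err, &global_err, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    // ... Shell patch computation could also be placed here ...

    // block and unpack only when the ghost zones are actually needed
    Parallel::SyncEnd(sh);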
@@ -3756,6 +3756,231 @@ void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
  delete[] transfer_src;
  delete[] transfer_dst;
}
//
// Async Sync: split into SyncBegin (initiate MPI) and SyncEnd (wait + unpack).
// This allows overlapping MPI communication with computation.
//
static void transfer_begin(Parallel::TransferState *ts)
{
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = ts->cpusize;

  ts->reqs = new MPI_Request[2 * cpusize];
  ts->stats = new MPI_Status[2 * cpusize];
  ts->req_no = 0;
  ts->send_data = new double *[cpusize];
  ts->rec_data = new double *[cpusize];
  int length;

  for (int node = 0; node < cpusize; node++)
  {
    ts->send_data[node] = ts->rec_data[node] = 0;
    if (node == myrank)
    {
      // Local copy: pack then immediately unpack (no MPI needed)
      if ((length = Parallel::data_packer(0, ts->transfer_src[myrank], ts->transfer_dst[myrank],
                                          node, PACK, ts->VarList1, ts->VarList2, ts->Symmetry)))
      {
        double *local_data = new double[length];
        if (!local_data)
        {
          cout << "out of memory in transfer_begin, local copy" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        Parallel::data_packer(local_data, ts->transfer_src[myrank], ts->transfer_dst[myrank],
                              node, PACK, ts->VarList1, ts->VarList2, ts->Symmetry);
        Parallel::data_packer(local_data, ts->transfer_src[node], ts->transfer_dst[node],
                              node, UNPACK, ts->VarList1, ts->VarList2, ts->Symmetry);
        delete[] local_data;
      }
    }
    else
    {
      // send from this cpu to cpu#node
      if ((length = Parallel::data_packer(0, ts->transfer_src[myrank], ts->transfer_dst[myrank],
                                          node, PACK, ts->VarList1, ts->VarList2, ts->Symmetry)))
      {
        ts->send_data[node] = new double[length];
        if (!ts->send_data[node])
        {
          cout << "out of memory in transfer_begin, send" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        Parallel::data_packer(ts->send_data[node], ts->transfer_src[myrank], ts->transfer_dst[myrank],
                              node, PACK, ts->VarList1, ts->VarList2, ts->Symmetry);
        MPI_Isend((void *)ts->send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD,
                  ts->reqs + ts->req_no++);
      }
      // receive from cpu#node to this cpu
      if ((length = Parallel::data_packer(0, ts->transfer_src[node], ts->transfer_dst[node],
                                          node, UNPACK, ts->VarList1, ts->VarList2, ts->Symmetry)))
      {
        ts->rec_data[node] = new double[length];
        if (!ts->rec_data[node])
        {
          cout << "out of memory in transfer_begin, recv" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        MPI_Irecv((void *)ts->rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD,
                  ts->reqs + ts->req_no++);
      }
    }
  }
  // NOTE: MPI_Waitall is NOT called here - that happens in transfer_end
}
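These functions rely on Parallel::TransferState, whose declaration is not part of this hunk. A plausible sketch of the fields implied by the calls above (the real declaration lives in the Parallel header and is nested in the class; this layout is an assumption):

    struct TransferState
    {
      int cpusize;                              // MPI communicator size
      MyList<var> *VarList1, *VarList2;         // variable lists for pack/unpack
      int Symmetry;
      MyList<Parallel::gridseg> *dst;           // ghost/buffer segments to fill
      MyList<Parallel::gridseg> **src;          // per-rank owned segments
      MyList<Parallel::gridseg> **transfer_src; // per-rank overlap, source side
      MyList<Parallel::gridseg> **transfer_dst; // per-rank overlap, destination side
      MPI_Request *reqs;                        // outstanding Isend/Irecv requests
      MPI_Status *stats;
      int req_no;                               // number of requests actually posted
      double **send_data, **rec_data;           // per-rank message buffers
    };

    struct SyncHandle
    {
      int num_states;                           // one per patch, plus one for inter-patch buffers
      TransferState *states;
    };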
//
static void transfer_end(Parallel::TransferState *ts)
{
  // Wait for all pending MPI operations
  MPI_Waitall(ts->req_no, ts->reqs, ts->stats);

  // Unpack received data from remote ranks
  for (int node = 0; node < ts->cpusize; node++)
    if (ts->rec_data[node])
      Parallel::data_packer(ts->rec_data[node], ts->transfer_src[node], ts->transfer_dst[node],
                            node, UNPACK, ts->VarList1, ts->VarList2, ts->Symmetry);

  // Cleanup MPI buffers
  for (int node = 0; node < ts->cpusize; node++)
  {
    if (ts->send_data[node])
      delete[] ts->send_data[node];
    if (ts->rec_data[node])
      delete[] ts->rec_data[node];
  }
  delete[] ts->reqs;
  delete[] ts->stats;
  delete[] ts->send_data;
  delete[] ts->rec_data;
}
//
Parallel::SyncHandle *Parallel::SyncBegin(Patch *Pat, MyList<var> *VarList, int Symmetry)
{
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  SyncHandle *handle = new SyncHandle;
  handle->num_states = 1;
  handle->states = new TransferState[1];

  TransferState *ts = &handle->states[0];
  ts->cpusize = cpusize;
  ts->VarList1 = VarList;
  ts->VarList2 = VarList;
  ts->Symmetry = Symmetry;

  ts->dst = build_ghost_gsl(Pat);
  ts->src = new MyList<Parallel::gridseg> *[cpusize];
  ts->transfer_src = new MyList<Parallel::gridseg> *[cpusize];
  ts->transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
  for (int node = 0; node < cpusize; node++)
  {
    ts->src[node] = build_owned_gsl0(Pat, node);
    build_gstl(ts->src[node], ts->dst, &ts->transfer_src[node], &ts->transfer_dst[node]);
  }

  transfer_begin(ts);

  return handle;
}
//
Parallel::SyncHandle *Parallel::SyncBegin(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
{
  int cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);

  // Count patches
  int num_patches = 0;
  MyList<Patch> *Pp = PatL;
  while (Pp) { num_patches++; Pp = Pp->next; }

  SyncHandle *handle = new SyncHandle;
  handle->num_states = num_patches + 1; // intra-patch transfers + 1 inter-patch transfer
  handle->states = new TransferState[handle->num_states];

  // Intra-patch sync: for each patch, build ghost lists and initiate transfer
  int idx = 0;
  Pp = PatL;
  while (Pp)
  {
    TransferState *ts = &handle->states[idx];
    ts->cpusize = cpusize;
    ts->VarList1 = VarList;
    ts->VarList2 = VarList;
    ts->Symmetry = Symmetry;

    ts->dst = build_ghost_gsl(Pp->data);
    ts->src = new MyList<Parallel::gridseg> *[cpusize];
    ts->transfer_src = new MyList<Parallel::gridseg> *[cpusize];
    ts->transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
    for (int node = 0; node < cpusize; node++)
    {
      ts->src[node] = build_owned_gsl0(Pp->data, node);
      build_gstl(ts->src[node], ts->dst, &ts->transfer_src[node], &ts->transfer_dst[node]);
    }

    transfer_begin(ts);

    idx++;
    Pp = Pp->next;
  }

  // Inter-patch sync: buffer zone exchange between patches
  {
    TransferState *ts = &handle->states[idx];
    ts->cpusize = cpusize;
    ts->VarList1 = VarList;
    ts->VarList2 = VarList;
    ts->Symmetry = Symmetry;

    ts->dst = build_buffer_gsl(PatL);
    ts->src = new MyList<Parallel::gridseg> *[cpusize];
    ts->transfer_src = new MyList<Parallel::gridseg> *[cpusize];
    ts->transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
    for (int node = 0; node < cpusize; node++)
    {
      ts->src[node] = build_owned_gsl(PatL, node, 5, Symmetry);
      build_gstl(ts->src[node], ts->dst, &ts->transfer_src[node], &ts->transfer_dst[node]);
    }

    transfer_begin(ts);
  }

  return handle;
}
//
void Parallel::SyncEnd(SyncHandle *handle)
{
  if (!handle)
    return;

  // Wait for all pending transfers and unpack
  for (int i = 0; i < handle->num_states; i++)
  {
    TransferState *ts = &handle->states[i];
    transfer_end(ts);

    // Cleanup grid segment lists
    if (ts->dst)
      ts->dst->destroyList();
    for (int node = 0; node < ts->cpusize; node++)
    {
      if (ts->src[node])
        ts->src[node]->destroyList();
      if (ts->transfer_src[node])
        ts->transfer_src[node]->destroyList();
      if (ts->transfer_dst[node])
        ts->transfer_dst[node]->destroyList();
    }
    delete[] ts->src;
    delete[] ts->transfer_src;
    delete[] ts->transfer_dst;
  }

  delete[] handle->states;
  delete handle;
}
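For reference, since the commit splits the original blocking Parallel::Sync into these two calls, the blocking behaviour should be recoverable by calling the pair back-to-back; a sketch, not part of this diff:

    // blocking sync expressed with the new primitives
    Parallel::SyncHandle *h = Parallel::SyncBegin(PatL, VarList, Symmetry);
    Parallel::SyncEnd(h);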
// collect buffer grid segments or blocks for the periodic boundary condition of given patch
// ---------------------------------------------------
// |con | |con |