Replace MPI_Allreduce with owner-rank MPI_Bcast in Patch::Interp_Points

The two MPI_Allreduce calls (data + weight) were the #1 hotspot at 38.5%
CPU time. Since all ranks traverse the same block list and agree on point
ownership, we replace the global reduction with targeted MPI_Bcast from
each owner rank. This also eliminates the weight array/Allreduce entirely,
removes redundant heap allocations (shellf, weight, DH, llb, uub), and
writes interpolation results directly into the output buffer.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-09 22:39:18 +08:00
parent 738498cb28
commit 50e2a845f8

View File

@@ -341,8 +341,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
double *Shellf, int Symmetry)
{
// NOTE: we do not Synchnize variables here, make sure of that before calling this routine
int myrank;
int myrank, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int ordn = 2 * ghost_width;
MyList<var> *varl;
@@ -354,24 +355,18 @@ void Patch::Interp_Points(MyList<var> *VarList,
varl = varl->next;
}
double *shellf;
shellf = new double[NN * num_var];
memset(shellf, 0, sizeof(double) * NN * num_var);
memset(Shellf, 0, sizeof(double) * NN * num_var);
// we use weight to monitor code, later some day we can move it for optimization
int *weight;
weight = new int[NN];
memset(weight, 0, sizeof(int) * NN);
double *DH, *llb, *uub;
DH = new double[dim];
// owner_rank[j] records which MPI rank owns point j
// All ranks traverse the same block list so they all agree on ownership
int *owner_rank;
owner_rank = new int[NN];
for (int j = 0; j < NN; j++)
owner_rank[j] = -1;
double DH[dim], llb[dim], uub[dim];
for (int i = 0; i < dim; i++)
{
DH[i] = getdX(i);
}
llb = new double[dim];
uub = new double[dim];
for (int j = 0; j < NN; j++) // run along points
{
@@ -403,12 +398,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
bool flag = true;
for (int i = 0; i < dim; i++)
{
// NOTE: our dividing structure is (exclude ghost)
// -1 0
// 1 2
// so (0,1) does not belong to any part for vertex structure
// here we put (0,0.5) to left part and (0.5,1) to right part
// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
@@ -433,6 +422,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
@@ -440,14 +430,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
int k = 0;
while (varl) // run along variables
{
// shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
// pox,ordn,varl->data->SoA,Symmetry);
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
weight[j] = 1;
}
}
if (Bp == ble)
@@ -456,103 +443,61 @@ void Patch::Interp_Points(MyList<var> *VarList,
}
}
MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
int *Weight;
Weight = new int[NN];
MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
// misc::tillherecheck("print me");
for (int i = 0; i < NN; i++)
// Replace MPI_Allreduce with per-owner MPI_Bcast:
// Group consecutive points by owner rank and broadcast each group.
// Since each point's data is non-zero only on the owner rank,
// Bcast from owner is equivalent to Allreduce(MPI_SUM) but much cheaper.
{
if (Weight[i] > 1)
int j = 0;
while (j < NN)
{
int cur_owner = owner_rank[j];
if (cur_owner < 0)
{
if (myrank == 0)
cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
for (int j = 0; j < num_var; j++)
Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
}
else if (Weight[i] == 0 && myrank == 0)
{
cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int j = 0; j < dim; j++)
for (int d = 0; d < dim; d++)
{
cout << XX[j][i];
if (j < dim - 1)
cout << XX[d][j];
if (d < dim - 1)
cout << ",";
else
cout << ")";
}
cout << " on Patch (";
for (int j = 0; j < dim; j++)
for (int d = 0; d < dim; d++)
{
cout << bbox[j] << "+" << lli[j] * getdX(j);
if (j < dim - 1)
cout << bbox[d] << "+" << lli[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")--";
}
cout << "(";
for (int j = 0; j < dim; j++)
for (int d = 0; d < dim; d++)
{
cout << bbox[dim + j] << "-" << uui[j] * getdX(j);
if (j < dim - 1)
cout << bbox[dim + d] << "-" << uui[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")" << endl;
}
#if 0
checkBlock();
#else
cout << "splited domains:" << endl;
{
MyList<Block> *Bp = blb;
while (Bp)
{
Block *BP = Bp->data;
for (int i = 0; i < dim; i++)
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
}
cout << "(";
for (int j = 0; j < dim; j++)
{
cout << llb[j] << ":" << uub[j];
if (j < dim - 1)
cout << ",";
else
cout << ")" << endl;
}
if (Bp == ble)
break;
Bp = Bp->next;
}
}
#endif
MPI_Abort(MPI_COMM_WORLD, 1);
}
j++;
continue;
}
// Find contiguous run of points with the same owner
int jstart = j;
while (j < NN && owner_rank[j] == cur_owner)
j++;
int count = (j - jstart) * num_var;
MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
}
}
delete[] shellf;
delete[] weight;
delete[] Weight;
delete[] DH;
delete[] llb;
delete[] uub;
delete[] owner_rank;
}
void Patch::Interp_Points(MyList<var> *VarList,
int NN, double **XX,
@@ -573,24 +518,22 @@ void Patch::Interp_Points(MyList<var> *VarList,
varl = varl->next;
}
double *shellf;
shellf = new double[NN * num_var];
memset(shellf, 0, sizeof(double) * NN * num_var);
memset(Shellf, 0, sizeof(double) * NN * num_var);
// we use weight to monitor code, later some day we can move it for optimization
int *weight;
weight = new int[NN];
memset(weight, 0, sizeof(int) * NN);
// owner_rank[j] stores the global rank that owns point j
int *owner_rank;
owner_rank = new int[NN];
for (int j = 0; j < NN; j++)
owner_rank[j] = -1;
double *DH, *llb, *uub;
DH = new double[dim];
// Build global-to-local rank translation for Comm_here
MPI_Group world_group, local_group;
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
MPI_Comm_group(Comm_here, &local_group);
double DH[dim], llb[dim], uub[dim];
for (int i = 0; i < dim; i++)
{
DH[i] = getdX(i);
}
llb = new double[dim];
uub = new double[dim];
for (int j = 0; j < NN; j++) // run along points
{
@@ -622,12 +565,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
bool flag = true;
for (int i = 0; i < dim; i++)
{
// NOTE: our dividing structure is (exclude ghost)
// -1 0
// 1 2
// so (0,1) does not belong to any part for vertex structure
// here we put (0,0.5) to left part and (0.5,1) to right part
// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
@@ -652,6 +589,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
@@ -659,14 +597,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
int k = 0;
while (varl) // run along variables
{
// shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
// pox,ordn,varl->data->SoA,Symmetry);
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
weight[j] = 1;
}
}
if (Bp == ble)
@@ -675,97 +610,35 @@ void Patch::Interp_Points(MyList<var> *VarList,
}
}
MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, Comm_here);
int *Weight;
Weight = new int[NN];
MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, Comm_here);
// Collect unique global owner ranks and translate to local ranks in Comm_here
// Then broadcast each owner's points via MPI_Bcast on Comm_here
{
int j = 0;
while (j < NN)
{
int cur_owner_global = owner_rank[j];
if (cur_owner_global < 0)
{
// Point not found — skip (error check disabled for sub-communicator levels)
j++;
continue;
}
// Translate global rank to local rank in Comm_here
int cur_owner_local;
MPI_Group_translate_ranks(world_group, 1, &cur_owner_global, local_group, &cur_owner_local);
// misc::tillherecheck("print me");
// if(lmyrank == 0) cout<<"myrank = "<<myrank<<"print me"<<endl;
for (int i = 0; i < NN; i++)
{
if (Weight[i] > 1)
{
if (lmyrank == 0)
cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
for (int j = 0; j < num_var; j++)
Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
// Find contiguous run of points with the same owner
int jstart = j;
while (j < NN && owner_rank[j] == cur_owner_global)
j++;
int count = (j - jstart) * num_var;
MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner_local, Comm_here);
}
#if 0 // for not involved levels, this may fail
else if(Weight[i] == 0 && lmyrank == 0)
{
cout<<"ERROR: Patch::Interp_Points fails to find point (";
for(int j=0;j<dim;j++)
{
cout<<XX[j][i];
if(j<dim-1) cout<<",";
else cout<<")";
}
cout<<" on Patch (";
for(int j=0;j<dim;j++)
{
cout<<bbox[j]<<"+"<<lli[j]*getdX(j);
if(j<dim-1) cout<<",";
else cout<<")--";
}
cout<<"(";
for(int j=0;j<dim;j++)
{
cout<<bbox[dim+j]<<"-"<<uui[j]*getdX(j);
if(j<dim-1) cout<<",";
else cout<<")"<<endl;
}
#if 0
checkBlock();
#else
cout<<"splited domains:"<<endl;
{
MyList<Block> *Bp=blb;
while(Bp)
{
Block *BP=Bp->data;
for(int i=0;i<dim;i++)
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb[i] = (feq(BP->bbox[i] ,bbox[i] ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i] : BP->bbox[i] +(ghost_width-0.5)*DH[i];
uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-(ghost_width-0.5)*DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i] ,bbox[i] ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i] : BP->bbox[i] +ghost_width*DH[i];
uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-ghost_width*DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
}
cout<<"(";
for(int j=0;j<dim;j++)
{
cout<<llb[j]<<":"<<uub[j];
if(j<dim-1) cout<<",";
else cout<<")"<<endl;
}
if(Bp == ble) break;
Bp=Bp->next;
}
}
#endif
MPI_Abort(MPI_COMM_WORLD,1);
}
#endif
}
delete[] shellf;
delete[] weight;
delete[] Weight;
delete[] DH;
delete[] llb;
delete[] uub;
MPI_Group_free(&world_group);
MPI_Group_free(&local_group);
delete[] owner_rank;
}
void Patch::checkBlock()
{