Compare commits
17 Commits
yx-prolong
...
chb-parall
| Author | SHA1 | Date | |
|---|---|---|---|
|
9c44d1c885
|
|||
|
4b9de28feb
|
|||
|
4eb5dc4ddb
|
|||
| 688bdb6708 | |||
| 5070134857 | |||
| 4012e9d068 | |||
| b3c367f15b | |||
| e73911f292 | |||
| 7543d3e8c7 | |||
| 42c69fab24 | |||
| 95220a05c8 | |||
| 466b084a58 | |||
| 61ccef9f97 | |||
| e11363e06e | |||
| f70e90f694 | |||
|
|
75dd5353b0 | ||
|
|
23a82d063b |
@@ -7,6 +7,7 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <new>
|
#include <new>
|
||||||
|
#include <vector>
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
@@ -17,6 +18,168 @@ using namespace std;
|
|||||||
#include "interp_lb_profile.h"
|
#include "interp_lb_profile.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
struct InterpBlockView
|
||||||
|
{
|
||||||
|
Block *bp;
|
||||||
|
double llb[dim];
|
||||||
|
double uub[dim];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct BlockBinIndex
|
||||||
|
{
|
||||||
|
int bins[dim];
|
||||||
|
double lo[dim];
|
||||||
|
double inv[dim];
|
||||||
|
vector<InterpBlockView> views;
|
||||||
|
vector<vector<int>> bin_to_blocks;
|
||||||
|
bool valid;
|
||||||
|
|
||||||
|
BlockBinIndex() : valid(false)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < dim; i++)
|
||||||
|
{
|
||||||
|
bins[i] = 1;
|
||||||
|
lo[i] = 0.0;
|
||||||
|
inv[i] = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Restricts v to the closed interval [lo, hi].
// The lower bound is checked first, so for a degenerate interval (lo > hi)
// any v below lo yields lo.
inline int clamp_int(int v, int lo, int hi)
{
    if (v < lo)
        return lo;
    if (v > hi)
        return hi;
    return v;
}
|
||||||
|
|
||||||
|
// Maps coordinate x to a bin number in [0, nb - 1].
//   x   : coordinate to classify
//   lo  : coordinate of the lower edge of bin 0
//   inv : bins per unit length; a non-positive value disables binning
//   nb  : number of bins along this axis
// Returns 0 when binning is disabled (nb <= 1 or inv <= 0). Coordinates below
// the first bin map to 0 and coordinates beyond the last bin map to nb - 1.
inline int coord_to_bin(double x, double lo, double inv, int nb)
{
    if (nb <= 1 || inv <= 0.0)
        return 0;

    // Clamp while still in floating point: converting a NaN or an out-of-range
    // double to int is undefined behaviour, so the cast must only ever see a
    // value already known to lie inside [0, nb - 1).
    const double t = std::floor((x - lo) * inv);
    if (!(t >= 0.0)) // also catches NaN, which fails every comparison
        return 0;
    if (t >= double(nb - 1))
        return nb - 1;
    return int(t);
}
|
||||||
|
|
||||||
|
inline int bin_loc(const BlockBinIndex &index, int b0, int b1, int b2)
|
||||||
|
{
|
||||||
|
return b0 + index.bins[0] * (b1 + index.bins[1] * b2);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool point_in_block_view(const InterpBlockView &view, const double *pox, const double *DH)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < dim; i++)
|
||||||
|
{
|
||||||
|
if (pox[i] - view.llb[i] < -DH[i] / 2 || pox[i] - view.uub[i] > DH[i] / 2)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Rebuilds `index` for the blocks of `patch`.
//   patch : patch whose block list (patch->blb .. patch->ble) is indexed
//   DH    : grid spacing, one per dimension
//   index : output; reset first, and left with valid == false if the patch has no blocks
// Steps: (1) record the ownership bounds of every block, (2) lay a uniform grid
// of bins over the patch, (3) register each block in every bin it overlaps.
void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
{
    index = BlockBinIndex();

    // (1) Walk the patch's blocks, stopping after the node equal to patch->ble.
    MyList<Block> *Bp = patch->blb;
    while (Bp)
    {
        Block *BP = Bp->data;
        InterpBlockView view;
        view.bp = BP;
        for (int i = 0; i < dim; i++)
        {
            // Where a block face coincides with the patch face (within DH/2) the
            // bound is offset by the patch's lli/uui; otherwise it is offset by
            // the ghost width (minus half a cell in the Vertex configuration).
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
            view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
            view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
            view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
            view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
        }
        index.views.push_back(view);
        if (Bp == patch->ble)
            break;
        Bp = Bp->next;
    }

    const int nblocks = int(index.views.size());
    if (nblocks <= 0)
        return; // nothing to index; index.valid stays false

    // (2) Use about cube-root(nblocks) bins per axis, capped at 32.
    int bins_1d = int(ceil(pow(double(nblocks), 1.0 / 3.0)));
    bins_1d = clamp_int(bins_1d, 1, 32);
    for (int i = 0; i < dim; i++)
    {
        index.bins[i] = bins_1d;
        index.lo[i] = patch->bbox[i] + patch->lli[i] * DH[i];
        const double hi = patch->bbox[dim + i] - patch->uui[i] * DH[i];
        // inv == 0 makes coord_to_bin() return bin 0 for this axis.
        if (hi > index.lo[i] && bins_1d > 1)
            index.inv[i] = bins_1d / (hi - index.lo[i]);
        else
            index.inv[i] = 0.0;
    }

    index.bin_to_blocks.resize(index.bins[0] * index.bins[1] * index.bins[2]);

    // (3) Register each block in every bin touched by its bounds widened by
    // DH/2, the same tolerance point_in_block_view() applies.
    for (int bi = 0; bi < nblocks; bi++)
    {
        const InterpBlockView &view = index.views[bi];
        int bmin[dim], bmax[dim];
        for (int d = 0; d < dim; d++)
        {
            const double low = view.llb[d] - DH[d] / 2;
            const double up = view.uub[d] + DH[d] / 2;
            bmin[d] = coord_to_bin(low, index.lo[d], index.inv[d], index.bins[d]);
            bmax[d] = coord_to_bin(up, index.lo[d], index.inv[d], index.bins[d]);
            // Keep the range ordered even if a block's bounds are inverted.
            if (bmax[d] < bmin[d])
            {
                int t = bmin[d];
                bmin[d] = bmax[d];
                bmax[d] = t;
            }
        }

        for (int bz = bmin[2]; bz <= bmax[2]; bz++)
            for (int by = bmin[1]; by <= bmax[1]; by++)
                for (int bx = bmin[0]; bx <= bmax[0]; bx++)
                    index.bin_to_blocks[bin_loc(index, bx, by, bz)].push_back(bi);
    }

    index.valid = true;
}
|
||||||
|
|
||||||
|
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
|
||||||
|
{
|
||||||
|
if (!index.valid)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
|
||||||
|
const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
|
||||||
|
const int bz = coord_to_bin(pox[2], index.lo[2], index.inv[2], index.bins[2]);
|
||||||
|
const vector<int> &cand = index.bin_to_blocks[bin_loc(index, bx, by, bz)];
|
||||||
|
|
||||||
|
for (size_t ci = 0; ci < cand.size(); ci++)
|
||||||
|
{
|
||||||
|
const int bi = cand[ci];
|
||||||
|
if (point_in_block_view(index.views[bi], pox, DH))
|
||||||
|
return bi;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback to full scan for numerical edge cases around bin boundaries.
|
||||||
|
for (size_t bi = 0; bi < index.views.size(); bi++)
|
||||||
|
if (point_in_block_view(index.views[bi], pox, DH))
|
||||||
|
return int(bi);
|
||||||
|
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
|
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
|
||||||
{
|
{
|
||||||
|
|
||||||
@@ -367,9 +530,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
for (int j = 0; j < NN; j++)
|
for (int j = 0; j < NN; j++)
|
||||||
owner_rank[j] = -1;
|
owner_rank[j] = -1;
|
||||||
|
|
||||||
double DH[dim], llb[dim], uub[dim];
|
double DH[dim];
|
||||||
for (int i = 0; i < dim; i++)
|
for (int i = 0; i < dim; i++)
|
||||||
DH[i] = getdX(i);
|
DH[i] = getdX(i);
|
||||||
|
BlockBinIndex block_index;
|
||||||
|
build_block_bin_index(this, DH, block_index);
|
||||||
|
|
||||||
for (int j = 0; j < NN; j++) // run along points
|
for (int j = 0; j < NN; j++) // run along points
|
||||||
{
|
{
|
||||||
@@ -392,57 +557,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MyList<Block> *Bp = blb;
|
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
||||||
bool notfind = true;
|
if (block_i >= 0)
|
||||||
while (notfind && Bp) // run along Blocks
|
|
||||||
{
|
{
|
||||||
Block *BP = Bp->data;
|
Block *BP = block_index.views[block_i].bp;
|
||||||
|
owner_rank[j] = BP->rank;
|
||||||
bool flag = true;
|
if (myrank == BP->rank)
|
||||||
for (int i = 0; i < dim; i++)
|
|
||||||
{
|
{
|
||||||
#ifdef Vertex
|
//---> interpolation
|
||||||
#ifdef Cell
|
varl = VarList;
|
||||||
#error Both Cell and Vertex are defined
|
int k = 0;
|
||||||
#endif
|
while (varl) // run along variables
|
||||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
|
|
||||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
|
|
||||||
#else
|
|
||||||
#ifdef Cell
|
|
||||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
|
|
||||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
|
|
||||||
#else
|
|
||||||
#error Not define Vertex nor Cell
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
|
|
||||||
{
|
{
|
||||||
flag = false;
|
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||||
break;
|
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||||
|
varl = varl->next;
|
||||||
|
k++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flag)
|
|
||||||
{
|
|
||||||
notfind = false;
|
|
||||||
owner_rank[j] = BP->rank;
|
|
||||||
if (myrank == BP->rank)
|
|
||||||
{
|
|
||||||
//---> interpolation
|
|
||||||
varl = VarList;
|
|
||||||
int k = 0;
|
|
||||||
while (varl) // run along variables
|
|
||||||
{
|
|
||||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
|
||||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
|
||||||
varl = varl->next;
|
|
||||||
k++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (Bp == ble)
|
|
||||||
break;
|
|
||||||
Bp = Bp->next;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -535,9 +667,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
for (int j = 0; j < NN; j++)
|
for (int j = 0; j < NN; j++)
|
||||||
owner_rank[j] = -1;
|
owner_rank[j] = -1;
|
||||||
|
|
||||||
double DH[dim], llb[dim], uub[dim];
|
double DH[dim];
|
||||||
for (int i = 0; i < dim; i++)
|
for (int i = 0; i < dim; i++)
|
||||||
DH[i] = getdX(i);
|
DH[i] = getdX(i);
|
||||||
|
BlockBinIndex block_index;
|
||||||
|
build_block_bin_index(this, DH, block_index);
|
||||||
|
|
||||||
// --- Interpolation phase (identical to original) ---
|
// --- Interpolation phase (identical to original) ---
|
||||||
for (int j = 0; j < NN; j++)
|
for (int j = 0; j < NN; j++)
|
||||||
@@ -561,56 +695,23 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MyList<Block> *Bp = blb;
|
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
||||||
bool notfind = true;
|
if (block_i >= 0)
|
||||||
while (notfind && Bp)
|
|
||||||
{
|
{
|
||||||
Block *BP = Bp->data;
|
Block *BP = block_index.views[block_i].bp;
|
||||||
|
owner_rank[j] = BP->rank;
|
||||||
bool flag = true;
|
if (myrank == BP->rank)
|
||||||
for (int i = 0; i < dim; i++)
|
|
||||||
{
|
{
|
||||||
#ifdef Vertex
|
varl = VarList;
|
||||||
#ifdef Cell
|
int k = 0;
|
||||||
#error Both Cell and Vertex are defined
|
while (varl)
|
||||||
#endif
|
|
||||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
|
|
||||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
|
|
||||||
#else
|
|
||||||
#ifdef Cell
|
|
||||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
|
|
||||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
|
|
||||||
#else
|
|
||||||
#error Not define Vertex nor Cell
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
|
|
||||||
{
|
{
|
||||||
flag = false;
|
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||||
break;
|
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||||
|
varl = varl->next;
|
||||||
|
k++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flag)
|
|
||||||
{
|
|
||||||
notfind = false;
|
|
||||||
owner_rank[j] = BP->rank;
|
|
||||||
if (myrank == BP->rank)
|
|
||||||
{
|
|
||||||
varl = VarList;
|
|
||||||
int k = 0;
|
|
||||||
while (varl)
|
|
||||||
{
|
|
||||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
|
||||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
|
||||||
varl = varl->next;
|
|
||||||
k++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (Bp == ble)
|
|
||||||
break;
|
|
||||||
Bp = Bp->next;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -833,9 +934,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
|
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
|
||||||
MPI_Comm_group(Comm_here, &local_group);
|
MPI_Comm_group(Comm_here, &local_group);
|
||||||
|
|
||||||
double DH[dim], llb[dim], uub[dim];
|
double DH[dim];
|
||||||
for (int i = 0; i < dim; i++)
|
for (int i = 0; i < dim; i++)
|
||||||
DH[i] = getdX(i);
|
DH[i] = getdX(i);
|
||||||
|
BlockBinIndex block_index;
|
||||||
|
build_block_bin_index(this, DH, block_index);
|
||||||
|
|
||||||
for (int j = 0; j < NN; j++) // run along points
|
for (int j = 0; j < NN; j++) // run along points
|
||||||
{
|
{
|
||||||
@@ -858,57 +961,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MyList<Block> *Bp = blb;
|
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
||||||
bool notfind = true;
|
if (block_i >= 0)
|
||||||
while (notfind && Bp) // run along Blocks
|
|
||||||
{
|
{
|
||||||
Block *BP = Bp->data;
|
Block *BP = block_index.views[block_i].bp;
|
||||||
|
owner_rank[j] = BP->rank;
|
||||||
bool flag = true;
|
if (myrank == BP->rank)
|
||||||
for (int i = 0; i < dim; i++)
|
|
||||||
{
|
{
|
||||||
#ifdef Vertex
|
//---> interpolation
|
||||||
#ifdef Cell
|
varl = VarList;
|
||||||
#error Both Cell and Vertex are defined
|
int k = 0;
|
||||||
#endif
|
while (varl) // run along variables
|
||||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
|
|
||||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
|
|
||||||
#else
|
|
||||||
#ifdef Cell
|
|
||||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
|
|
||||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
|
|
||||||
#else
|
|
||||||
#error Not define Vertex nor Cell
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
|
|
||||||
{
|
{
|
||||||
flag = false;
|
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||||
break;
|
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||||
|
varl = varl->next;
|
||||||
|
k++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flag)
|
|
||||||
{
|
|
||||||
notfind = false;
|
|
||||||
owner_rank[j] = BP->rank;
|
|
||||||
if (myrank == BP->rank)
|
|
||||||
{
|
|
||||||
//---> interpolation
|
|
||||||
varl = VarList;
|
|
||||||
int k = 0;
|
|
||||||
while (varl) // run along variables
|
|
||||||
{
|
|
||||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
|
||||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
|
||||||
varl = varl->next;
|
|
||||||
k++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (Bp == ble)
|
|
||||||
break;
|
|
||||||
Bp = Bp->next;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -3893,66 +3893,105 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
|
|||||||
|
|
||||||
int node;
|
int node;
|
||||||
|
|
||||||
MPI_Request *reqs;
|
MPI_Request *reqs = new MPI_Request[2 * cpusize];
|
||||||
MPI_Status *stats;
|
MPI_Status *stats = new MPI_Status[2 * cpusize];
|
||||||
reqs = new MPI_Request[2 * cpusize];
|
int *req_node = new int[2 * cpusize];
|
||||||
stats = new MPI_Status[2 * cpusize];
|
int *req_is_recv = new int[2 * cpusize];
|
||||||
|
int *completed = new int[2 * cpusize];
|
||||||
int req_no = 0;
|
int req_no = 0;
|
||||||
|
int pending_recv = 0;
|
||||||
|
|
||||||
double **send_data, **rec_data;
|
double **send_data = new double *[cpusize];
|
||||||
send_data = new double *[cpusize];
|
double **rec_data = new double *[cpusize];
|
||||||
rec_data = new double *[cpusize];
|
int *send_lengths = new int[cpusize];
|
||||||
int length;
|
int *recv_lengths = new int[cpusize];
|
||||||
|
|
||||||
for (node = 0; node < cpusize; node++)
|
for (node = 0; node < cpusize; node++)
|
||||||
{
|
{
|
||||||
send_data[node] = rec_data[node] = 0;
|
send_data[node] = rec_data[node] = 0;
|
||||||
if (node == myrank)
|
send_lengths[node] = recv_lengths[node] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Post receives first so peers can progress rendezvous early.
|
||||||
|
for (node = 0; node < cpusize; node++)
|
||||||
|
{
|
||||||
|
if (node == myrank) continue;
|
||||||
|
|
||||||
|
recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
|
if (recv_lengths[node] > 0)
|
||||||
{
|
{
|
||||||
if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
|
rec_data[node] = new double[recv_lengths[node]];
|
||||||
|
if (!rec_data[node])
|
||||||
{
|
{
|
||||||
rec_data[node] = new double[length];
|
cout << "out of memory when new in short transfer, place 1" << endl;
|
||||||
if (!rec_data[node])
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
{
|
|
||||||
cout << "out of memory when new in short transfer, place 1" << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
data_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
|
||||||
}
|
}
|
||||||
|
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
|
||||||
|
req_node[req_no] = node;
|
||||||
|
req_is_recv[req_no] = 1;
|
||||||
|
req_no++;
|
||||||
|
pending_recv++;
|
||||||
}
|
}
|
||||||
else
|
}
|
||||||
|
|
||||||
|
// Local transfer on this rank.
|
||||||
|
recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||||
|
if (recv_lengths[myrank] > 0)
|
||||||
|
{
|
||||||
|
rec_data[myrank] = new double[recv_lengths[myrank]];
|
||||||
|
if (!rec_data[myrank])
|
||||||
{
|
{
|
||||||
// send from this cpu to cpu#node
|
cout << "out of memory when new in short transfer, place 2" << endl;
|
||||||
if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pack and post sends.
|
||||||
|
for (node = 0; node < cpusize; node++)
|
||||||
|
{
|
||||||
|
if (node == myrank) continue;
|
||||||
|
|
||||||
|
send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||||
|
if (send_lengths[node] > 0)
|
||||||
|
{
|
||||||
|
send_data[node] = new double[send_lengths[node]];
|
||||||
|
if (!send_data[node])
|
||||||
{
|
{
|
||||||
send_data[node] = new double[length];
|
cout << "out of memory when new in short transfer, place 3" << endl;
|
||||||
if (!send_data[node])
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
{
|
|
||||||
cout << "out of memory when new in short transfer, place 2" << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
|
||||||
MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
|
|
||||||
}
|
}
|
||||||
// receive from cpu#node to this cpu
|
data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||||
if (length = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
|
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
|
||||||
|
req_node[req_no] = node;
|
||||||
|
req_is_recv[req_no] = 0;
|
||||||
|
req_no++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unpack as soon as receive completes to reduce pure wait time.
|
||||||
|
while (pending_recv > 0)
|
||||||
|
{
|
||||||
|
int outcount = 0;
|
||||||
|
MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
|
||||||
|
if (outcount == MPI_UNDEFINED) break;
|
||||||
|
|
||||||
|
for (int i = 0; i < outcount; i++)
|
||||||
|
{
|
||||||
|
int idx = completed[i];
|
||||||
|
if (idx >= 0 && req_is_recv[idx])
|
||||||
{
|
{
|
||||||
rec_data[node] = new double[length];
|
int recv_node = req_node[idx];
|
||||||
if (!rec_data[node])
|
data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
{
|
pending_recv--;
|
||||||
cout << "out of memory when new in short transfer, place 3" << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// wait for all requests to complete
|
|
||||||
MPI_Waitall(req_no, reqs, stats);
|
|
||||||
|
|
||||||
for (node = 0; node < cpusize; node++)
|
if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
|
||||||
if (rec_data[node])
|
|
||||||
data_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
if (rec_data[myrank])
|
||||||
|
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
|
|
||||||
for (node = 0; node < cpusize; node++)
|
for (node = 0; node < cpusize; node++)
|
||||||
{
|
{
|
||||||
@@ -3964,8 +4003,13 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
|
|||||||
|
|
||||||
delete[] reqs;
|
delete[] reqs;
|
||||||
delete[] stats;
|
delete[] stats;
|
||||||
|
delete[] req_node;
|
||||||
|
delete[] req_is_recv;
|
||||||
|
delete[] completed;
|
||||||
delete[] send_data;
|
delete[] send_data;
|
||||||
delete[] rec_data;
|
delete[] rec_data;
|
||||||
|
delete[] send_lengths;
|
||||||
|
delete[] recv_lengths;
|
||||||
}
|
}
|
||||||
//
|
//
|
||||||
void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
|
void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
|
||||||
@@ -3978,66 +4022,105 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
|
|||||||
|
|
||||||
int node;
|
int node;
|
||||||
|
|
||||||
MPI_Request *reqs;
|
MPI_Request *reqs = new MPI_Request[2 * cpusize];
|
||||||
MPI_Status *stats;
|
MPI_Status *stats = new MPI_Status[2 * cpusize];
|
||||||
reqs = new MPI_Request[2 * cpusize];
|
int *req_node = new int[2 * cpusize];
|
||||||
stats = new MPI_Status[2 * cpusize];
|
int *req_is_recv = new int[2 * cpusize];
|
||||||
|
int *completed = new int[2 * cpusize];
|
||||||
int req_no = 0;
|
int req_no = 0;
|
||||||
|
int pending_recv = 0;
|
||||||
|
|
||||||
double **send_data, **rec_data;
|
double **send_data = new double *[cpusize];
|
||||||
send_data = new double *[cpusize];
|
double **rec_data = new double *[cpusize];
|
||||||
rec_data = new double *[cpusize];
|
int *send_lengths = new int[cpusize];
|
||||||
int length;
|
int *recv_lengths = new int[cpusize];
|
||||||
|
|
||||||
for (node = 0; node < cpusize; node++)
|
for (node = 0; node < cpusize; node++)
|
||||||
{
|
{
|
||||||
send_data[node] = rec_data[node] = 0;
|
send_data[node] = rec_data[node] = 0;
|
||||||
if (node == myrank)
|
send_lengths[node] = recv_lengths[node] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Post receives first so peers can progress rendezvous early.
|
||||||
|
for (node = 0; node < cpusize; node++)
|
||||||
|
{
|
||||||
|
if (node == myrank) continue;
|
||||||
|
|
||||||
|
recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
|
if (recv_lengths[node] > 0)
|
||||||
{
|
{
|
||||||
if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
|
rec_data[node] = new double[recv_lengths[node]];
|
||||||
|
if (!rec_data[node])
|
||||||
{
|
{
|
||||||
rec_data[node] = new double[length];
|
cout << "out of memory when new in short transfer, place 1" << endl;
|
||||||
if (!rec_data[node])
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
{
|
|
||||||
cout << "out of memory when new in short transfer, place 1" << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
data_packermix(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
|
||||||
}
|
}
|
||||||
|
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
|
||||||
|
req_node[req_no] = node;
|
||||||
|
req_is_recv[req_no] = 1;
|
||||||
|
req_no++;
|
||||||
|
pending_recv++;
|
||||||
}
|
}
|
||||||
else
|
}
|
||||||
|
|
||||||
|
// Local transfer on this rank.
|
||||||
|
recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||||
|
if (recv_lengths[myrank] > 0)
|
||||||
|
{
|
||||||
|
rec_data[myrank] = new double[recv_lengths[myrank]];
|
||||||
|
if (!rec_data[myrank])
|
||||||
{
|
{
|
||||||
// send from this cpu to cpu#node
|
cout << "out of memory when new in short transfer, place 2" << endl;
|
||||||
if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pack and post sends.
|
||||||
|
for (node = 0; node < cpusize; node++)
|
||||||
|
{
|
||||||
|
if (node == myrank) continue;
|
||||||
|
|
||||||
|
send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||||
|
if (send_lengths[node] > 0)
|
||||||
|
{
|
||||||
|
send_data[node] = new double[send_lengths[node]];
|
||||||
|
if (!send_data[node])
|
||||||
{
|
{
|
||||||
send_data[node] = new double[length];
|
cout << "out of memory when new in short transfer, place 3" << endl;
|
||||||
if (!send_data[node])
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
{
|
|
||||||
cout << "out of memory when new in short transfer, place 2" << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
|
||||||
MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
|
|
||||||
}
|
}
|
||||||
// receive from cpu#node to this cpu
|
data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||||
if (length = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
|
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
|
||||||
|
req_node[req_no] = node;
|
||||||
|
req_is_recv[req_no] = 0;
|
||||||
|
req_no++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unpack as soon as receive completes to reduce pure wait time.
|
||||||
|
while (pending_recv > 0)
|
||||||
|
{
|
||||||
|
int outcount = 0;
|
||||||
|
MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
|
||||||
|
if (outcount == MPI_UNDEFINED) break;
|
||||||
|
|
||||||
|
for (int i = 0; i < outcount; i++)
|
||||||
|
{
|
||||||
|
int idx = completed[i];
|
||||||
|
if (idx >= 0 && req_is_recv[idx])
|
||||||
{
|
{
|
||||||
rec_data[node] = new double[length];
|
int recv_node = req_node[idx];
|
||||||
if (!rec_data[node])
|
data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
{
|
pending_recv--;
|
||||||
cout << "out of memory when new in short transfer, place 3" << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// wait for all requests to complete
|
|
||||||
MPI_Waitall(req_no, reqs, stats);
|
|
||||||
|
|
||||||
for (node = 0; node < cpusize; node++)
|
if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
|
||||||
if (rec_data[node])
|
|
||||||
data_packermix(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
if (rec_data[myrank])
|
||||||
|
data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
|
|
||||||
for (node = 0; node < cpusize; node++)
|
for (node = 0; node < cpusize; node++)
|
||||||
{
|
{
|
||||||
@@ -4049,8 +4132,13 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
|
|||||||
|
|
||||||
delete[] reqs;
|
delete[] reqs;
|
||||||
delete[] stats;
|
delete[] stats;
|
||||||
|
delete[] req_node;
|
||||||
|
delete[] req_is_recv;
|
||||||
|
delete[] completed;
|
||||||
delete[] send_data;
|
delete[] send_data;
|
||||||
delete[] rec_data;
|
delete[] rec_data;
|
||||||
|
delete[] send_lengths;
|
||||||
|
delete[] recv_lengths;
|
||||||
}
|
}
|
||||||
void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
|
void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
|
||||||
{
|
{
|
||||||
@@ -4232,7 +4320,7 @@ Parallel::SyncCache::SyncCache()
|
|||||||
: valid(false), cpusize(0), combined_src(0), combined_dst(0),
|
: valid(false), cpusize(0), combined_src(0), combined_dst(0),
|
||||||
send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
|
send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
|
||||||
send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
|
send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
|
||||||
lengths_valid(false)
|
lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
// SyncCache invalidate: free grid segment lists but keep buffers
|
// SyncCache invalidate: free grid segment lists but keep buffers
|
||||||
@@ -4271,11 +4359,15 @@ void Parallel::SyncCache::destroy()
|
|||||||
if (recv_bufs) delete[] recv_bufs;
|
if (recv_bufs) delete[] recv_bufs;
|
||||||
if (reqs) delete[] reqs;
|
if (reqs) delete[] reqs;
|
||||||
if (stats) delete[] stats;
|
if (stats) delete[] stats;
|
||||||
|
if (tc_req_node) delete[] tc_req_node;
|
||||||
|
if (tc_req_is_recv) delete[] tc_req_is_recv;
|
||||||
|
if (tc_completed) delete[] tc_completed;
|
||||||
combined_src = combined_dst = 0;
|
combined_src = combined_dst = 0;
|
||||||
send_lengths = recv_lengths = 0;
|
send_lengths = recv_lengths = 0;
|
||||||
send_buf_caps = recv_buf_caps = 0;
|
send_buf_caps = recv_buf_caps = 0;
|
||||||
send_bufs = recv_bufs = 0;
|
send_bufs = recv_bufs = 0;
|
||||||
reqs = 0; stats = 0;
|
reqs = 0; stats = 0;
|
||||||
|
tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0;
|
||||||
cpusize = 0; max_reqs = 0;
|
cpusize = 0; max_reqs = 0;
|
||||||
}
|
}
|
||||||
// transfer_cached: reuse pre-allocated buffers from SyncCache
|
// transfer_cached: reuse pre-allocated buffers from SyncCache
|
||||||
@@ -4289,64 +4381,96 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
|||||||
int cpusize = cache.cpusize;
|
int cpusize = cache.cpusize;
|
||||||
|
|
||||||
int req_no = 0;
|
int req_no = 0;
|
||||||
|
int pending_recv = 0;
|
||||||
int node;
|
int node;
|
||||||
|
int *req_node = cache.tc_req_node;
|
||||||
|
int *req_is_recv = cache.tc_req_is_recv;
|
||||||
|
int *completed = cache.tc_completed;
|
||||||
|
|
||||||
|
// Post receives first so peers can progress rendezvous early.
|
||||||
for (node = 0; node < cpusize; node++)
|
for (node = 0; node < cpusize; node++)
|
||||||
{
|
{
|
||||||
if (node == myrank)
|
if (node == myrank) continue;
|
||||||
|
|
||||||
|
int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
|
cache.recv_lengths[node] = rlength;
|
||||||
|
if (rlength > 0)
|
||||||
{
|
{
|
||||||
int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
if (rlength > cache.recv_buf_caps[node])
|
||||||
cache.recv_lengths[node] = length;
|
|
||||||
if (length > 0)
|
|
||||||
{
|
{
|
||||||
if (length > cache.recv_buf_caps[node])
|
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
||||||
{
|
cache.recv_bufs[node] = new double[rlength];
|
||||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
cache.recv_buf_caps[node] = rlength;
|
||||||
cache.recv_bufs[node] = new double[length];
|
|
||||||
cache.recv_buf_caps[node] = length;
|
|
||||||
}
|
|
||||||
data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
|
||||||
}
|
}
|
||||||
|
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||||
|
req_node[req_no] = node;
|
||||||
|
req_is_recv[req_no] = 1;
|
||||||
|
req_no++;
|
||||||
|
pending_recv++;
|
||||||
}
|
}
|
||||||
else
|
}
|
||||||
|
|
||||||
|
// Local transfer on this rank.
|
||||||
|
int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||||
|
cache.recv_lengths[myrank] = self_len;
|
||||||
|
if (self_len > 0)
|
||||||
|
{
|
||||||
|
if (self_len > cache.recv_buf_caps[myrank])
|
||||||
{
|
{
|
||||||
// send
|
if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank];
|
||||||
int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
cache.recv_bufs[myrank] = new double[self_len];
|
||||||
cache.send_lengths[node] = slength;
|
cache.recv_buf_caps[myrank] = self_len;
|
||||||
if (slength > 0)
|
}
|
||||||
|
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pack and post sends.
|
||||||
|
for (node = 0; node < cpusize; node++)
|
||||||
|
{
|
||||||
|
if (node == myrank) continue;
|
||||||
|
|
||||||
|
int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||||
|
cache.send_lengths[node] = slength;
|
||||||
|
if (slength > 0)
|
||||||
|
{
|
||||||
|
if (slength > cache.send_buf_caps[node])
|
||||||
{
|
{
|
||||||
if (slength > cache.send_buf_caps[node])
|
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
|
||||||
{
|
cache.send_bufs[node] = new double[slength];
|
||||||
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
|
cache.send_buf_caps[node] = slength;
|
||||||
cache.send_bufs[node] = new double[slength];
|
|
||||||
cache.send_buf_caps[node] = slength;
|
|
||||||
}
|
|
||||||
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
|
||||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
|
|
||||||
}
|
}
|
||||||
// recv
|
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||||
int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||||
cache.recv_lengths[node] = rlength;
|
req_node[req_no] = node;
|
||||||
if (rlength > 0)
|
req_is_recv[req_no] = 0;
|
||||||
|
req_no++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unpack as soon as receive completes to reduce pure wait time.
|
||||||
|
while (pending_recv > 0)
|
||||||
|
{
|
||||||
|
int outcount = 0;
|
||||||
|
MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
|
||||||
|
if (outcount == MPI_UNDEFINED) break;
|
||||||
|
|
||||||
|
for (int i = 0; i < outcount; i++)
|
||||||
|
{
|
||||||
|
int idx = completed[i];
|
||||||
|
if (idx >= 0 && req_is_recv[idx])
|
||||||
{
|
{
|
||||||
if (rlength > cache.recv_buf_caps[node])
|
int recv_node_i = req_node[idx];
|
||||||
{
|
data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
pending_recv--;
|
||||||
cache.recv_bufs[node] = new double[rlength];
|
|
||||||
cache.recv_buf_caps[node] = rlength;
|
|
||||||
}
|
|
||||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MPI_Waitall(req_no, cache.reqs, cache.stats);
|
if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
|
||||||
|
|
||||||
for (node = 0; node < cpusize; node++)
|
if (self_len > 0)
|
||||||
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
|
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
|
||||||
}
|
}
|
||||||
// Sync_cached: build grid segment lists on first call, reuse on subsequent calls
|
|
||||||
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
|
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
|
||||||
{
|
{
|
||||||
if (!cache.valid)
|
if (!cache.valid)
|
||||||
@@ -4374,6 +4498,9 @@ void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
|
|||||||
cache.max_reqs = 2 * cpusize;
|
cache.max_reqs = 2 * cpusize;
|
||||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||||
cache.stats = new MPI_Status[cache.max_reqs];
|
cache.stats = new MPI_Status[cache.max_reqs];
|
||||||
|
cache.tc_req_node = new int[cache.max_reqs];
|
||||||
|
cache.tc_req_is_recv = new int[cache.max_reqs];
|
||||||
|
cache.tc_completed = new int[cache.max_reqs];
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int node = 0; node < cpusize; node++)
|
for (int node = 0; node < cpusize; node++)
|
||||||
@@ -4474,6 +4601,9 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
|||||||
cache.max_reqs = 2 * cpusize;
|
cache.max_reqs = 2 * cpusize;
|
||||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||||
cache.stats = new MPI_Status[cache.max_reqs];
|
cache.stats = new MPI_Status[cache.max_reqs];
|
||||||
|
cache.tc_req_node = new int[cache.max_reqs];
|
||||||
|
cache.tc_req_is_recv = new int[cache.max_reqs];
|
||||||
|
cache.tc_completed = new int[cache.max_reqs];
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int node = 0; node < cpusize; node++)
|
for (int node = 0; node < cpusize; node++)
|
||||||
@@ -4544,6 +4674,11 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
|||||||
int cpusize = cache.cpusize;
|
int cpusize = cache.cpusize;
|
||||||
state.req_no = 0;
|
state.req_no = 0;
|
||||||
state.active = true;
|
state.active = true;
|
||||||
|
state.pending_recv = 0;
|
||||||
|
// Allocate tracking arrays
|
||||||
|
delete[] state.req_node; delete[] state.req_is_recv;
|
||||||
|
state.req_node = new int[cache.max_reqs];
|
||||||
|
state.req_is_recv = new int[cache.max_reqs];
|
||||||
|
|
||||||
MyList<Parallel::gridseg> **src = cache.combined_src;
|
MyList<Parallel::gridseg> **src = cache.combined_src;
|
||||||
MyList<Parallel::gridseg> **dst = cache.combined_dst;
|
MyList<Parallel::gridseg> **dst = cache.combined_dst;
|
||||||
@@ -4588,6 +4723,8 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
|||||||
cache.send_buf_caps[node] = slength;
|
cache.send_buf_caps[node] = slength;
|
||||||
}
|
}
|
||||||
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
||||||
|
state.req_node[state.req_no] = node;
|
||||||
|
state.req_is_recv[state.req_no] = 0;
|
||||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
||||||
}
|
}
|
||||||
int rlength;
|
int rlength;
|
||||||
@@ -4605,29 +4742,60 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
|||||||
cache.recv_bufs[node] = new double[rlength];
|
cache.recv_bufs[node] = new double[rlength];
|
||||||
cache.recv_buf_caps[node] = rlength;
|
cache.recv_buf_caps[node] = rlength;
|
||||||
}
|
}
|
||||||
|
state.req_node[state.req_no] = node;
|
||||||
|
state.req_is_recv[state.req_no] = 1;
|
||||||
|
state.pending_recv++;
|
||||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cache.lengths_valid = true;
|
cache.lengths_valid = true;
|
||||||
}
|
}
|
||||||
// Sync_finish: wait for async MPI operations and unpack
|
// Sync_finish: progressive unpack as receives complete, then wait for sends
|
||||||
void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
|
void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
|
||||||
MyList<var> *VarList, int Symmetry)
|
MyList<var> *VarList, int Symmetry)
|
||||||
{
|
{
|
||||||
if (!state.active)
|
if (!state.active)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
MPI_Waitall(state.req_no, cache.reqs, cache.stats);
|
int myrank;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
int cpusize = cache.cpusize;
|
|
||||||
MyList<Parallel::gridseg> **src = cache.combined_src;
|
MyList<Parallel::gridseg> **src = cache.combined_src;
|
||||||
MyList<Parallel::gridseg> **dst = cache.combined_dst;
|
MyList<Parallel::gridseg> **dst = cache.combined_dst;
|
||||||
|
|
||||||
for (int node = 0; node < cpusize; node++)
|
// Unpack local data first (no MPI needed)
|
||||||
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
|
if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0)
|
||||||
data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
|
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry);
|
||||||
|
|
||||||
|
// Progressive unpack of remote receives
|
||||||
|
if (state.pending_recv > 0 && state.req_no > 0)
|
||||||
|
{
|
||||||
|
int pending = state.pending_recv;
|
||||||
|
int *completed = new int[cache.max_reqs];
|
||||||
|
while (pending > 0)
|
||||||
|
{
|
||||||
|
int outcount = 0;
|
||||||
|
MPI_Waitsome(state.req_no, cache.reqs, &outcount, completed, cache.stats);
|
||||||
|
if (outcount == MPI_UNDEFINED) break;
|
||||||
|
for (int i = 0; i < outcount; i++)
|
||||||
|
{
|
||||||
|
int idx = completed[i];
|
||||||
|
if (idx >= 0 && state.req_is_recv[idx])
|
||||||
|
{
|
||||||
|
int recv_node = state.req_node[idx];
|
||||||
|
data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry);
|
||||||
|
pending--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
delete[] completed;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for remaining sends
|
||||||
|
if (state.req_no > 0) MPI_Waitall(state.req_no, cache.reqs, cache.stats);
|
||||||
|
|
||||||
|
delete[] state.req_node; state.req_node = 0;
|
||||||
|
delete[] state.req_is_recv; state.req_is_recv = 0;
|
||||||
state.active = false;
|
state.active = false;
|
||||||
}
|
}
|
||||||
// collect buffer grid segments or blocks for the periodic boundary condition of given patch
|
// collect buffer grid segments or blocks for the periodic boundary condition of given patch
|
||||||
@@ -5694,6 +5862,9 @@ void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
|||||||
cache.max_reqs = 2 * cpusize;
|
cache.max_reqs = 2 * cpusize;
|
||||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||||
cache.stats = new MPI_Status[cache.max_reqs];
|
cache.stats = new MPI_Status[cache.max_reqs];
|
||||||
|
cache.tc_req_node = new int[cache.max_reqs];
|
||||||
|
cache.tc_req_is_recv = new int[cache.max_reqs];
|
||||||
|
cache.tc_completed = new int[cache.max_reqs];
|
||||||
}
|
}
|
||||||
|
|
||||||
MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
|
MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
|
||||||
@@ -5740,6 +5911,9 @@ void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
|||||||
cache.max_reqs = 2 * cpusize;
|
cache.max_reqs = 2 * cpusize;
|
||||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||||
cache.stats = new MPI_Status[cache.max_reqs];
|
cache.stats = new MPI_Status[cache.max_reqs];
|
||||||
|
cache.tc_req_node = new int[cache.max_reqs];
|
||||||
|
cache.tc_req_is_recv = new int[cache.max_reqs];
|
||||||
|
cache.tc_completed = new int[cache.max_reqs];
|
||||||
}
|
}
|
||||||
|
|
||||||
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
|
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
|
||||||
@@ -5786,6 +5960,9 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
|||||||
cache.max_reqs = 2 * cpusize;
|
cache.max_reqs = 2 * cpusize;
|
||||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||||
cache.stats = new MPI_Status[cache.max_reqs];
|
cache.stats = new MPI_Status[cache.max_reqs];
|
||||||
|
cache.tc_req_node = new int[cache.max_reqs];
|
||||||
|
cache.tc_req_is_recv = new int[cache.max_reqs];
|
||||||
|
cache.tc_completed = new int[cache.max_reqs];
|
||||||
}
|
}
|
||||||
|
|
||||||
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
|
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
|
||||||
@@ -5807,58 +5984,98 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
|||||||
int cpusize = cache.cpusize;
|
int cpusize = cache.cpusize;
|
||||||
|
|
||||||
int req_no = 0;
|
int req_no = 0;
|
||||||
|
int pending_recv = 0;
|
||||||
|
int *req_node = new int[cache.max_reqs];
|
||||||
|
int *req_is_recv = new int[cache.max_reqs];
|
||||||
|
int *completed = new int[cache.max_reqs];
|
||||||
|
|
||||||
|
// Post receives first so peers can progress rendezvous early.
|
||||||
for (int node = 0; node < cpusize; node++)
|
for (int node = 0; node < cpusize; node++)
|
||||||
{
|
{
|
||||||
if (node == myrank)
|
if (node == myrank) continue;
|
||||||
|
|
||||||
|
int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
|
cache.recv_lengths[node] = rlength;
|
||||||
|
if (rlength > 0)
|
||||||
{
|
{
|
||||||
int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
if (rlength > cache.recv_buf_caps[node])
|
||||||
cache.recv_lengths[node] = length;
|
|
||||||
if (length > 0)
|
|
||||||
{
|
{
|
||||||
if (length > cache.recv_buf_caps[node])
|
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
||||||
{
|
cache.recv_bufs[node] = new double[rlength];
|
||||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
cache.recv_buf_caps[node] = rlength;
|
||||||
cache.recv_bufs[node] = new double[length];
|
|
||||||
cache.recv_buf_caps[node] = length;
|
|
||||||
}
|
|
||||||
data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
|
||||||
}
|
}
|
||||||
|
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||||
|
req_node[req_no] = node;
|
||||||
|
req_is_recv[req_no] = 1;
|
||||||
|
req_no++;
|
||||||
|
pending_recv++;
|
||||||
}
|
}
|
||||||
else
|
}
|
||||||
|
|
||||||
|
// Local transfer on this rank.
|
||||||
|
int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||||
|
cache.recv_lengths[myrank] = self_len;
|
||||||
|
if (self_len > 0)
|
||||||
|
{
|
||||||
|
if (self_len > cache.recv_buf_caps[myrank])
|
||||||
{
|
{
|
||||||
int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank];
|
||||||
cache.send_lengths[node] = slength;
|
cache.recv_bufs[myrank] = new double[self_len];
|
||||||
if (slength > 0)
|
cache.recv_buf_caps[myrank] = self_len;
|
||||||
|
}
|
||||||
|
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pack and post sends.
|
||||||
|
for (int node = 0; node < cpusize; node++)
|
||||||
|
{
|
||||||
|
if (node == myrank) continue;
|
||||||
|
|
||||||
|
int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||||
|
cache.send_lengths[node] = slength;
|
||||||
|
if (slength > 0)
|
||||||
|
{
|
||||||
|
if (slength > cache.send_buf_caps[node])
|
||||||
{
|
{
|
||||||
if (slength > cache.send_buf_caps[node])
|
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
|
||||||
{
|
cache.send_bufs[node] = new double[slength];
|
||||||
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
|
cache.send_buf_caps[node] = slength;
|
||||||
cache.send_bufs[node] = new double[slength];
|
|
||||||
cache.send_buf_caps[node] = slength;
|
|
||||||
}
|
|
||||||
data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
|
||||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
|
|
||||||
}
|
}
|
||||||
int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||||
cache.recv_lengths[node] = rlength;
|
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||||
if (rlength > 0)
|
req_node[req_no] = node;
|
||||||
|
req_is_recv[req_no] = 0;
|
||||||
|
req_no++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unpack as soon as receive completes to reduce pure wait time.
|
||||||
|
while (pending_recv > 0)
|
||||||
|
{
|
||||||
|
int outcount = 0;
|
||||||
|
MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
|
||||||
|
if (outcount == MPI_UNDEFINED) break;
|
||||||
|
|
||||||
|
for (int i = 0; i < outcount; i++)
|
||||||
|
{
|
||||||
|
int idx = completed[i];
|
||||||
|
if (idx >= 0 && req_is_recv[idx])
|
||||||
{
|
{
|
||||||
if (rlength > cache.recv_buf_caps[node])
|
int recv_node_i = req_node[idx];
|
||||||
{
|
data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
pending_recv--;
|
||||||
cache.recv_bufs[node] = new double[rlength];
|
|
||||||
cache.recv_buf_caps[node] = rlength;
|
|
||||||
}
|
|
||||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MPI_Waitall(req_no, cache.reqs, cache.stats);
|
if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
|
||||||
|
|
||||||
for (int node = 0; node < cpusize; node++)
|
if (self_len > 0)
|
||||||
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
|
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
|
||||||
|
delete[] req_node;
|
||||||
|
delete[] req_is_recv;
|
||||||
|
delete[] completed;
|
||||||
}
|
}
|
||||||
|
|
||||||
// collect all buffer grid segments or blocks for given patch
|
// collect all buffer grid segments or blocks for given patch
|
||||||
|
|||||||
@@ -108,6 +108,9 @@ namespace Parallel
|
|||||||
MPI_Status *stats;
|
MPI_Status *stats;
|
||||||
int max_reqs;
|
int max_reqs;
|
||||||
bool lengths_valid;
|
bool lengths_valid;
|
||||||
|
int *tc_req_node;
|
||||||
|
int *tc_req_is_recv;
|
||||||
|
int *tc_completed;
|
||||||
SyncCache();
|
SyncCache();
|
||||||
void invalidate();
|
void invalidate();
|
||||||
void destroy();
|
void destroy();
|
||||||
@@ -121,7 +124,10 @@ namespace Parallel
|
|||||||
struct AsyncSyncState {
|
struct AsyncSyncState {
|
||||||
int req_no;
|
int req_no;
|
||||||
bool active;
|
bool active;
|
||||||
AsyncSyncState() : req_no(0), active(false) {}
|
int *req_node;
|
||||||
|
int *req_is_recv;
|
||||||
|
int pending_recv;
|
||||||
|
AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
|
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
|
||||||
|
|||||||
@@ -736,6 +736,8 @@ void bssn_class::Initialize()
|
|||||||
sync_cache_cor = new Parallel::SyncCache[GH->levels];
|
sync_cache_cor = new Parallel::SyncCache[GH->levels];
|
||||||
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
|
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
|
||||||
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
|
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
|
||||||
|
sync_cache_restrict = new Parallel::SyncCache[GH->levels];
|
||||||
|
sync_cache_outbd = new Parallel::SyncCache[GH->levels];
|
||||||
}
|
}
|
||||||
|
|
||||||
//================================================================================================
|
//================================================================================================
|
||||||
@@ -2213,7 +2215,7 @@ void bssn_class::Evolve(int Steps)
|
|||||||
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
|
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
|
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
|
||||||
@@ -2429,7 +2431,7 @@ void bssn_class::RecursiveStep(int lev)
|
|||||||
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2608,7 +2610,7 @@ void bssn_class::ParallelStep()
|
|||||||
if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2775,7 +2777,7 @@ void bssn_class::ParallelStep()
|
|||||||
if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
|
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
// a_stream.str("");
|
// a_stream.str("");
|
||||||
@@ -2790,7 +2792,7 @@ void bssn_class::ParallelStep()
|
|||||||
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
// a_stream.str("");
|
// a_stream.str("");
|
||||||
@@ -2809,7 +2811,7 @@ void bssn_class::ParallelStep()
|
|||||||
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
// a_stream.str("");
|
// a_stream.str("");
|
||||||
@@ -2825,7 +2827,7 @@ void bssn_class::ParallelStep()
|
|||||||
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
// a_stream.str("");
|
// a_stream.str("");
|
||||||
@@ -5746,6 +5748,12 @@ void bssn_class::SHStep()
|
|||||||
// 0: do not use mixing two levels data for OutBD; 1: do use
|
// 0: do not use mixing two levels data for OutBD; 1: do use
|
||||||
|
|
||||||
#define MIXOUTB 0
|
#define MIXOUTB 0
|
||||||
|
// In the cached Restrict->OutBdLow2Hi path, coarse Sync is usually redundant:
|
||||||
|
// OutBdLow2Hi_cached reads coarse owned cells (build_owned_gsl type-4), not coarse ghost/buffer cells.
|
||||||
|
// Keep a switch to restore the old behavior if needed for debugging.
|
||||||
|
#ifndef RP_SYNC_COARSE_AFTER_RESTRICT
|
||||||
|
#define RP_SYNC_COARSE_AFTER_RESTRICT 0
|
||||||
|
#endif
|
||||||
void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||||
MyList<var> *SL, MyList<var> *OL, MyList<var> *corL)
|
MyList<var> *SL, MyList<var> *OL, MyList<var> *corL)
|
||||||
// we assume
|
// we assume
|
||||||
@@ -5796,7 +5804,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
|
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
|
||||||
#elif (RPB == 1)
|
#elif (RPB == 1)
|
||||||
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
|
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
|
||||||
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
|
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
|
||||||
@@ -5809,7 +5817,9 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
|||||||
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
|
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
|
||||||
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
|
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (PSTR == 1 || PSTR == 2)
|
#if (PSTR == 1 || PSTR == 2)
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
@@ -5820,7 +5830,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
|||||||
|
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
#if (MIXOUTB == 0)
|
#if (MIXOUTB == 0)
|
||||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
|
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
|
||||||
#elif (MIXOUTB == 1)
|
#elif (MIXOUTB == 1)
|
||||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
|
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
|
||||||
#endif
|
#endif
|
||||||
@@ -5847,7 +5857,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
|
||||||
#elif (RPB == 1)
|
#elif (RPB == 1)
|
||||||
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
|
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
|
||||||
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
|
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
|
||||||
@@ -5860,7 +5870,9 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
|||||||
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
|
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
|
||||||
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
|
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (PSTR == 1 || PSTR == 2)
|
#if (PSTR == 1 || PSTR == 2)
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
@@ -5871,7 +5883,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
|||||||
|
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
#if (MIXOUTB == 0)
|
#if (MIXOUTB == 0)
|
||||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
|
||||||
#elif (MIXOUTB == 1)
|
#elif (MIXOUTB == 1)
|
||||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
||||||
#endif
|
#endif
|
||||||
@@ -5940,17 +5952,19 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
|
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
|
||||||
#elif (RPB == 1)
|
#elif (RPB == 1)
|
||||||
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
|
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
|
||||||
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
|
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
|
||||||
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
|
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
#if (MIXOUTB == 0)
|
#if (MIXOUTB == 0)
|
||||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
|
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
|
||||||
#elif (MIXOUTB == 1)
|
#elif (MIXOUTB == 1)
|
||||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
|
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
|
||||||
#endif
|
#endif
|
||||||
@@ -5962,17 +5976,19 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
|
|||||||
else // no time refinement levels and for all same time levels
|
else // no time refinement levels and for all same time levels
|
||||||
{
|
{
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
|
||||||
#elif (RPB == 1)
|
#elif (RPB == 1)
|
||||||
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
|
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
|
||||||
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
|
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
|
||||||
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
|
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
#if (MIXOUTB == 0)
|
#if (MIXOUTB == 0)
|
||||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
|
||||||
#elif (MIXOUTB == 1)
|
#elif (MIXOUTB == 1)
|
||||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
||||||
#endif
|
#endif
|
||||||
@@ -6027,17 +6043,19 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
|
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
|
||||||
#elif (RPB == 1)
|
#elif (RPB == 1)
|
||||||
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry);
|
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry);
|
||||||
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
|
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
|
||||||
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
|
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
#if (MIXOUTB == 0)
|
#if (MIXOUTB == 0)
|
||||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
|
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
|
||||||
#elif (MIXOUTB == 1)
|
#elif (MIXOUTB == 1)
|
||||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
|
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
|
||||||
#endif
|
#endif
|
||||||
@@ -6051,17 +6069,19 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
|||||||
if (myrank == 0)
|
if (myrank == 0)
|
||||||
cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
|
cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
|
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
|
||||||
#elif (RPB == 1)
|
#elif (RPB == 1)
|
||||||
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry);
|
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry);
|
||||||
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
|
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
|
||||||
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
|
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
#if (MIXOUTB == 0)
|
#if (MIXOUTB == 0)
|
||||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
|
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
|
||||||
#elif (MIXOUTB == 1)
|
#elif (MIXOUTB == 1)
|
||||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
|
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
|
||||||
#endif
|
#endif
|
||||||
@@ -6102,7 +6122,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
|
|||||||
|
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
#if (MIXOUTB == 0)
|
#if (MIXOUTB == 0)
|
||||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
|
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
|
||||||
#elif (MIXOUTB == 1)
|
#elif (MIXOUTB == 1)
|
||||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
|
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
|
||||||
#endif
|
#endif
|
||||||
@@ -6115,7 +6135,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
|
|||||||
{
|
{
|
||||||
#if (RPB == 0)
|
#if (RPB == 0)
|
||||||
#if (MIXOUTB == 0)
|
#if (MIXOUTB == 0)
|
||||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
|
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
|
||||||
#elif (MIXOUTB == 1)
|
#elif (MIXOUTB == 1)
|
||||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
|
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
|
||||||
#endif
|
#endif
|
||||||
@@ -6134,13 +6154,16 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
|
|||||||
#else
|
#else
|
||||||
Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
|
Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
|
||||||
#endif
|
#endif
|
||||||
|
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
|
||||||
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
|
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
|
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#undef MIXOUTB
|
#undef MIXOUTB
|
||||||
|
#undef RP_SYNC_COARSE_AFTER_RESTRICT
|
||||||
|
|
||||||
//================================================================================================
|
//================================================================================================
|
||||||
|
|
||||||
|
|||||||
@@ -130,6 +130,8 @@ public:
|
|||||||
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
|
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
|
||||||
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
|
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
|
||||||
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
|
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
|
||||||
|
Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong
|
||||||
|
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
|
||||||
|
|
||||||
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
|
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
|
||||||
monitor *ConVMonitor;
|
monitor *ConVMonitor;
|
||||||
|
|||||||
@@ -716,7 +716,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
|
|
||||||
// 24ms //
|
// 24ms //
|
||||||
fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
|
fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
|
||||||
fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
|
|
||||||
|
|
||||||
// 6ms //
|
// 6ms //
|
||||||
for (int i=0;i<all;i+=1) {
|
for (int i=0;i<all;i+=1) {
|
||||||
@@ -1014,12 +1013,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
betaz_rhs[i] = FF * dtSfz[i];
|
betaz_rhs[i] = FF * dtSfz[i];
|
||||||
|
|
||||||
reta[i] =
|
reta[i] =
|
||||||
gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i]
|
gupxx[i] * chix[i] * chix[i]
|
||||||
+ gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i]
|
+ gupyy[i] * chiy[i] * chiy[i]
|
||||||
+ gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i]
|
+ gupzz[i] * chiz[i] * chiz[i]
|
||||||
+ TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i]
|
+ TWO * ( gupxy[i] * chix[i] * chiy[i]
|
||||||
+ gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i]
|
+ gupxz[i] * chix[i] * chiz[i]
|
||||||
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
|
+ gupyz[i] * chiy[i] * chiz[i] );
|
||||||
|
|
||||||
#if (GAUGE == 2)
|
#if (GAUGE == 2)
|
||||||
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
|
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
|
||||||
@@ -1032,12 +1031,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
|
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
|
||||||
#elif (GAUGE == 4 || GAUGE == 5)
|
#elif (GAUGE == 4 || GAUGE == 5)
|
||||||
reta[i] =
|
reta[i] =
|
||||||
gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i]
|
gupxx[i] * chix[i] * chix[i]
|
||||||
+ gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i]
|
+ gupyy[i] * chiy[i] * chiy[i]
|
||||||
+ gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i]
|
+ gupzz[i] * chiz[i] * chiz[i]
|
||||||
+ TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i]
|
+ TWO * ( gupxy[i] * chix[i] * chiy[i]
|
||||||
+ gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i]
|
+ gupxz[i] * chix[i] * chiz[i]
|
||||||
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
|
+ gupyz[i] * chiy[i] * chiz[i] );
|
||||||
|
|
||||||
#if (GAUGE == 4)
|
#if (GAUGE == 4)
|
||||||
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
|
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
|
||||||
@@ -1139,59 +1138,59 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
|
fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
|
||||||
fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0);
|
fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0);
|
||||||
fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
|
fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
|
||||||
}
|
// 7ms //
|
||||||
// 7ms //
|
for (int i=0;i<all;i+=1) {
|
||||||
for (int i=0;i<all;i+=1) {
|
gxxx[i] = gxxx[i] - ( Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]
|
||||||
gxxx[i] = gxxx[i] - ( Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]
|
+ Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i];
|
||||||
+ Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i];
|
gxyx[i] = gxyx[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
|
||||||
gxyx[i] = gxyx[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
|
+ Gamxxx[i] * Axy[i] + Gamyxx[i] * Ayy[i] + Gamzxx[i] * Ayz[i]) - chix[i]*Axy[i]/chin1[i];
|
||||||
+ Gamxxx[i] * Axy[i] + Gamyxx[i] * Ayy[i] + Gamzxx[i] * Ayz[i]) - chix[i]*Axy[i]/chin1[i];
|
gxzx[i] = gxzx[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
|
||||||
gxzx[i] = gxzx[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
|
+ Gamxxx[i] * Axz[i] + Gamyxx[i] * Ayz[i] + Gamzxx[i] * Azz[i]) - chix[i]*Axz[i]/chin1[i];
|
||||||
+ Gamxxx[i] * Axz[i] + Gamyxx[i] * Ayz[i] + Gamzxx[i] * Azz[i]) - chix[i]*Axz[i]/chin1[i];
|
gyyx[i] = gyyx[i] - ( Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]
|
||||||
gyyx[i] = gyyx[i] - ( Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]
|
+ Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chix[i]*Ayy[i]/chin1[i];
|
||||||
+ Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chix[i]*Ayy[i]/chin1[i];
|
gyzx[i] = gyzx[i] - ( Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]
|
||||||
gyzx[i] = gyzx[i] - ( Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]
|
+ Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chix[i]*Ayz[i]/chin1[i];
|
||||||
+ Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chix[i]*Ayz[i]/chin1[i];
|
gzzx[i] = gzzx[i] - ( Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]
|
||||||
gzzx[i] = gzzx[i] - ( Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]
|
+ Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chix[i]*Azz[i]/chin1[i];
|
||||||
+ Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chix[i]*Azz[i]/chin1[i];
|
gxxy[i] = gxxy[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
|
||||||
gxxy[i] = gxxy[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
|
+ Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]) - chiy[i]*Axx[i]/chin1[i];
|
||||||
+ Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]) - chiy[i]*Axx[i]/chin1[i];
|
gxyy[i] = gxyy[i] - ( Gamxyy[i] * Axx[i] + Gamyyy[i] * Axy[i] + Gamzyy[i] * Axz[i]
|
||||||
gxyy[i] = gxyy[i] - ( Gamxyy[i] * Axx[i] + Gamyyy[i] * Axy[i] + Gamzyy[i] * Axz[i]
|
+ Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chiy[i]*Axy[i]/chin1[i];
|
||||||
+ Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chiy[i]*Axy[i]/chin1[i];
|
gxzy[i] = gxzy[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
|
||||||
gxzy[i] = gxzy[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
|
+ Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chiy[i]*Axz[i]/chin1[i];
|
||||||
+ Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chiy[i]*Axz[i]/chin1[i];
|
gyyy[i] = gyyy[i] - ( Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]
|
||||||
gyyy[i] = gyyy[i] - ( Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]
|
+ Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]) - chiy[i]*Ayy[i]/chin1[i];
|
||||||
+ Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]) - chiy[i]*Ayy[i]/chin1[i];
|
gyzy[i] = gyzy[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
|
||||||
gyzy[i] = gyzy[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
|
+ Gamxyy[i] * Axz[i] + Gamyyy[i] * Ayz[i] + Gamzyy[i] * Azz[i]) - chiy[i]*Ayz[i]/chin1[i];
|
||||||
+ Gamxyy[i] * Axz[i] + Gamyyy[i] * Ayz[i] + Gamzyy[i] * Azz[i]) - chiy[i]*Ayz[i]/chin1[i];
|
gzzy[i] = gzzy[i] - ( Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]
|
||||||
gzzy[i] = gzzy[i] - ( Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]
|
+ Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiy[i]*Azz[i]/chin1[i];
|
||||||
+ Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiy[i]*Azz[i]/chin1[i];
|
gxxz[i] = gxxz[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
|
||||||
gxxz[i] = gxxz[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
|
+ Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]) - chiz[i]*Axx[i]/chin1[i];
|
||||||
+ Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]) - chiz[i]*Axx[i]/chin1[i];
|
gxyz[i] = gxyz[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
|
||||||
gxyz[i] = gxyz[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
|
+ Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]) - chiz[i]*Axy[i]/chin1[i];
|
||||||
+ Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]) - chiz[i]*Axy[i]/chin1[i];
|
gxzz[i] = gxzz[i] - ( Gamxzz[i] * Axx[i] + Gamyzz[i] * Axy[i] + Gamzzz[i] * Axz[i]
|
||||||
gxzz[i] = gxzz[i] - ( Gamxzz[i] * Axx[i] + Gamyzz[i] * Axy[i] + Gamzzz[i] * Axz[i]
|
+ Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chiz[i]*Axz[i]/chin1[i];
|
||||||
+ Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chiz[i]*Axz[i]/chin1[i];
|
gyyz[i] = gyyz[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
|
||||||
gyyz[i] = gyyz[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
|
+ Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]) - chiz[i]*Ayy[i]/chin1[i];
|
||||||
+ Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]) - chiz[i]*Ayy[i]/chin1[i];
|
gyzz[i] = gyzz[i] - ( Gamxzz[i] * Axy[i] + Gamyzz[i] * Ayy[i] + Gamzzz[i] * Ayz[i]
|
||||||
gyzz[i] = gyzz[i] - ( Gamxzz[i] * Axy[i] + Gamyzz[i] * Ayy[i] + Gamzzz[i] * Ayz[i]
|
+ Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiz[i]*Ayz[i]/chin1[i];
|
||||||
+ Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiz[i]*Ayz[i]/chin1[i];
|
gzzz[i] = gzzz[i] - ( Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]
|
||||||
gzzz[i] = gzzz[i] - ( Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]
|
+ Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]) - chiz[i]*Azz[i]/chin1[i];
|
||||||
+ Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]) - chiz[i]*Azz[i]/chin1[i];
|
|
||||||
|
|
||||||
movx_Res[i] = gupxx[i]*gxxx[i] + gupyy[i]*gxyy[i] + gupzz[i]*gxzz[i]
|
movx_Res[i] = gupxx[i]*gxxx[i] + gupyy[i]*gxyy[i] + gupzz[i]*gxzz[i]
|
||||||
+ gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i] + gupyz[i]*gxzy[i]
|
+ gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i] + gupyz[i]*gxzy[i]
|
||||||
+ gupxy[i]*gxxy[i] + gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i];
|
+ gupxy[i]*gxxy[i] + gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i];
|
||||||
movy_Res[i] = gupxx[i]*gxyx[i] + gupyy[i]*gyyy[i] + gupzz[i]*gyzz[i]
|
movy_Res[i] = gupxx[i]*gxyx[i] + gupyy[i]*gyyy[i] + gupzz[i]*gyzz[i]
|
||||||
+ gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i] + gupyz[i]*gyzy[i]
|
+ gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i] + gupyz[i]*gyzy[i]
|
||||||
+ gupxy[i]*gxyy[i] + gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i];
|
+ gupxy[i]*gxyy[i] + gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i];
|
||||||
movz_Res[i] = gupxx[i]*gxzx[i] + gupyy[i]*gyzy[i] + gupzz[i]*gzzz[i]
|
movz_Res[i] = gupxx[i]*gxzx[i] + gupyy[i]*gyzy[i] + gupzz[i]*gzzz[i]
|
||||||
+ gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i] + gupyz[i]*gzzy[i]
|
+ gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i] + gupyz[i]*gzzy[i]
|
||||||
+ gupxy[i]*gxzy[i] + gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i];
|
+ gupxy[i]*gxzy[i] + gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i];
|
||||||
|
|
||||||
movx_Res[i] = movx_Res[i] - F2o3*Kx[i] - F8*PI*Sx[i];
|
movx_Res[i] = movx_Res[i] - F2o3*Kx[i] - F8*PI*Sx[i];
|
||||||
movy_Res[i] = movy_Res[i] - F2o3*Ky[i] - F8*PI*Sy[i];
|
movy_Res[i] = movy_Res[i] - F2o3*Ky[i] - F8*PI*Sy[i];
|
||||||
movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
|
movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1513,6 +1513,7 @@
|
|||||||
real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh
|
real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh
|
||||||
real*8, dimension(3) :: SoA
|
real*8, dimension(3) :: SoA
|
||||||
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
|
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
|
||||||
|
integer :: i_core_min,i_core_max,j_core_min,j_core_max,k_core_min,k_core_max
|
||||||
real*8 :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
|
real*8 :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
|
||||||
real*8 :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
|
real*8 :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
|
||||||
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
|
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
|
||||||
@@ -1565,9 +1566,47 @@
|
|||||||
fxz = ZEO
|
fxz = ZEO
|
||||||
fyz = ZEO
|
fyz = ZEO
|
||||||
|
|
||||||
|
i_core_min = max(1, imin+2)
|
||||||
|
i_core_max = min(ex(1), imax-2)
|
||||||
|
j_core_min = max(1, jmin+2)
|
||||||
|
j_core_max = min(ex(2), jmax-2)
|
||||||
|
k_core_min = max(1, kmin+2)
|
||||||
|
k_core_max = min(ex(3), kmax-2)
|
||||||
|
|
||||||
|
if(i_core_min <= i_core_max .and. j_core_min <= j_core_max .and. k_core_min <= k_core_max)then
|
||||||
|
do k=k_core_min,k_core_max
|
||||||
|
do j=j_core_min,j_core_max
|
||||||
|
do i=i_core_min,i_core_max
|
||||||
|
! interior points always use 4th-order stencils without branch checks
|
||||||
|
fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
|
||||||
|
-fh(i+2,j,k)+F16*fh(i+1,j,k) )
|
||||||
|
fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
|
||||||
|
-fh(i,j+2,k)+F16*fh(i,j+1,k) )
|
||||||
|
fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
|
||||||
|
-fh(i,j,k+2)+F16*fh(i,j,k+1) )
|
||||||
|
fxy(i,j,k) = Fdxdy*( (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k)) &
|
||||||
|
-F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k)) &
|
||||||
|
+F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k)) &
|
||||||
|
- (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
|
||||||
|
fxz(i,j,k) = Fdxdz*( (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2)) &
|
||||||
|
-F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1)) &
|
||||||
|
+F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1)) &
|
||||||
|
- (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
|
||||||
|
fyz(i,j,k) = Fdydz*( (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2)) &
|
||||||
|
-F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1)) &
|
||||||
|
+F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1)) &
|
||||||
|
- (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
|
||||||
|
enddo
|
||||||
|
enddo
|
||||||
|
enddo
|
||||||
|
endif
|
||||||
|
|
||||||
do k=1,ex(3)
|
do k=1,ex(3)
|
||||||
do j=1,ex(2)
|
do j=1,ex(2)
|
||||||
do i=1,ex(1)
|
do i=1,ex(1)
|
||||||
|
if(i>=i_core_min .and. i<=i_core_max .and. &
|
||||||
|
j>=j_core_min .and. j<=j_core_max .and. &
|
||||||
|
k>=k_core_min .and. k<=k_core_max) cycle
|
||||||
!~~~~~~ fxx
|
!~~~~~~ fxx
|
||||||
if(i+2 <= imax .and. i-2 >= imin)then
|
if(i+2 <= imax .and. i-2 >= imin)then
|
||||||
!
|
!
|
||||||
|
|||||||
@@ -141,12 +141,26 @@ void fdderivs(const int ex[3],
|
|||||||
const int j4_hi = ex2 - 3;
|
const int j4_hi = ex2 - 3;
|
||||||
const int k4_hi = ex3 - 3;
|
const int k4_hi = ex3 - 3;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Strategy A:
|
||||||
|
* Avoid redundant work in overlap of 2nd/4th-order regions.
|
||||||
|
* Only compute 2nd-order on shell points that are NOT overwritten by
|
||||||
|
* the 4th-order pass.
|
||||||
|
*/
|
||||||
|
const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi);
|
||||||
|
|
||||||
if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
|
if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
|
||||||
for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
|
for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
|
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
|
||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
|
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
|
||||||
|
if (has4 &&
|
||||||
|
i0 >= i4_lo && i0 <= i4_hi &&
|
||||||
|
j0 >= j4_lo && j0 <= j4_hi &&
|
||||||
|
k0 >= k4_lo && k0 <= k4_hi) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
const int iF = i0 + 1;
|
const int iF = i0 + 1;
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||||
|
|
||||||
@@ -193,7 +207,7 @@ void fdderivs(const int ex[3],
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
|
if (has4) {
|
||||||
for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
|
for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
|
for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
|
||||||
|
|||||||
@@ -1956,11 +1956,13 @@
|
|||||||
|
|
||||||
real*8,dimension(3) :: CD,FD
|
real*8,dimension(3) :: CD,FD
|
||||||
real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
|
real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
|
||||||
real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果
|
real*8 :: tmp_xyz_line(-2:extc(1)) ! 包含 X 向 6 点模板访问所需下界
|
||||||
real*8 :: v1, v2, v3, v4, v5, v6
|
real*8 :: v1, v2, v3, v4, v5, v6
|
||||||
integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max
|
integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max,kc_min,kc_max
|
||||||
|
integer :: i_lo, i_hi, j_lo, j_hi, k_lo, k_hi
|
||||||
|
logical :: need_full_symmetry
|
||||||
real*8 :: res_line
|
real*8 :: res_line
|
||||||
real*8 :: tmp_z_slab(extc(1), extc(2)) ! 分配在 k 循环外
|
real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2)) ! 包含 Y/X 向模板访问所需下界
|
||||||
if(wei.ne.3)then
|
if(wei.ne.3)then
|
||||||
write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
|
write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
|
||||||
write(*,*)"dim = ",wei
|
write(*,*)"dim = ",wei
|
||||||
@@ -2063,24 +2065,41 @@
|
|||||||
endif
|
endif
|
||||||
enddo
|
enddo
|
||||||
|
|
||||||
maxcx = maxval(cix(imino:imaxo))
|
ic_min = minval(cix(imino:imaxo))
|
||||||
maxcy = maxval(ciy(jmino:jmaxo))
|
ic_max = maxval(cix(imino:imaxo))
|
||||||
maxcz = maxval(ciz(kmino:kmaxo))
|
jc_min = minval(ciy(jmino:jmaxo))
|
||||||
|
jc_max = maxval(ciy(jmino:jmaxo))
|
||||||
|
kc_min = minval(ciz(kmino:kmaxo))
|
||||||
|
kc_max = maxval(ciz(kmino:kmaxo))
|
||||||
|
|
||||||
|
maxcx = ic_max
|
||||||
|
maxcy = jc_max
|
||||||
|
maxcz = kc_max
|
||||||
if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
|
if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
|
||||||
write(*,*)"error in prolong"
|
write(*,*)"error in prolong"
|
||||||
return
|
return
|
||||||
endif
|
endif
|
||||||
|
|
||||||
call symmetry_bd(3,extc,func,funcc,SoA)
|
i_lo = ic_min - 2
|
||||||
|
i_hi = ic_max + 3
|
||||||
|
j_lo = jc_min - 2
|
||||||
|
j_hi = jc_max + 3
|
||||||
|
k_lo = kc_min - 2
|
||||||
|
k_hi = kc_max + 3
|
||||||
|
need_full_symmetry = (i_lo < 1) .or. (j_lo < 1) .or. (k_lo < 1)
|
||||||
|
if(need_full_symmetry)then
|
||||||
|
call symmetry_bd(3,extc,func,funcc,SoA)
|
||||||
|
else
|
||||||
|
funcc(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi) = func(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi)
|
||||||
|
endif
|
||||||
|
|
||||||
! 对每个 k(pz, kc 固定)预计算 Z 向插值的 2D 切片
|
! 对每个 k(pz, kc 固定)预计算 Z 向插值的 2D 切片
|
||||||
jc_min = minval(ciy(jmino:jmaxo))
|
|
||||||
jc_max = maxval(ciy(jmino:jmaxo))
|
|
||||||
|
|
||||||
do k = kmino, kmaxo
|
do k = kmino, kmaxo
|
||||||
pz = piz(k); kc = ciz(k)
|
pz = piz(k); kc = ciz(k)
|
||||||
! --- Pass 1: Z 方向,只算一次 ---
|
! --- Pass 1: Z 方向,只算一次 ---
|
||||||
do iy = jc_min-3, jc_max+3 ! 仅需的 iy 范围
|
do iy = jc_min-2, jc_max+3 ! 仅需的 iy 范围(对应 jc-2:jc+3)
|
||||||
do ii = imini-3, imaxi+3 ! 仅需的 ii 范围
|
do ii = ic_min-2, ic_max+3 ! 仅需的 ii 范围(对应 cix-2:cix+3)
|
||||||
tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
|
tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
|
||||||
end do
|
end do
|
||||||
end do
|
end do
|
||||||
@@ -2088,7 +2107,7 @@ do k = kmino, kmaxo
|
|||||||
do j = jmino, jmaxo
|
do j = jmino, jmaxo
|
||||||
py = piy(j); jc = ciy(j)
|
py = piy(j); jc = ciy(j)
|
||||||
! --- Pass 2: Y 方向 ---
|
! --- Pass 2: Y 方向 ---
|
||||||
do ii = imini-3, imaxi+3
|
do ii = ic_min-2, ic_max+3
|
||||||
tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
|
tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
|
||||||
end do
|
end do
|
||||||
! --- Pass 3: X 方向 ---
|
! --- Pass 3: X 方向 ---
|
||||||
@@ -2351,9 +2370,12 @@ end do
|
|||||||
|
|
||||||
real*8,dimension(3) :: CD,FD
|
real*8,dimension(3) :: CD,FD
|
||||||
|
|
||||||
real*8 :: tmp_xz_plane(extf(1), 6)
|
real*8 :: tmp_xz_plane(-1:extf(1), 6)
|
||||||
real*8 :: tmp_x_line(extf(1))
|
real*8 :: tmp_x_line(-1:extf(1))
|
||||||
integer :: fi, fj, fk, ii, jj, kk
|
integer :: fi, fj, fk, ii, jj, kk
|
||||||
|
integer :: fi_min, fi_max, ii_lo, ii_hi
|
||||||
|
integer :: fj_min, fj_max, fk_min, fk_max, jj_lo, jj_hi, kk_lo, kk_hi
|
||||||
|
logical :: need_full_symmetry
|
||||||
|
|
||||||
if(wei.ne.3)then
|
if(wei.ne.3)then
|
||||||
write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
|
write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
|
||||||
@@ -2433,7 +2455,34 @@ end do
|
|||||||
stop
|
stop
|
||||||
endif
|
endif
|
||||||
|
|
||||||
call symmetry_bd(2,extf,funf,funff,SoA)
|
! 仅计算 X 向最终写回所需的窗口:
|
||||||
|
! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3)
|
||||||
|
fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1
|
||||||
|
fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1
|
||||||
|
fj_min = 2*(jmino + lbc(2) - 1) - 1 - lbf(2) + 1
|
||||||
|
fj_max = 2*(jmaxo + lbc(2) - 1) - 1 - lbf(2) + 1
|
||||||
|
fk_min = 2*(kmino + lbc(3) - 1) - 1 - lbf(3) + 1
|
||||||
|
fk_max = 2*(kmaxo + lbc(3) - 1) - 1 - lbf(3) + 1
|
||||||
|
ii_lo = fi_min - 2
|
||||||
|
ii_hi = fi_max + 3
|
||||||
|
jj_lo = fj_min - 2
|
||||||
|
jj_hi = fj_max + 3
|
||||||
|
kk_lo = fk_min - 2
|
||||||
|
kk_hi = fk_max + 3
|
||||||
|
if(ii_lo < -1 .or. ii_hi > extf(1) .or. &
|
||||||
|
jj_lo < -1 .or. jj_hi > extf(2) .or. &
|
||||||
|
kk_lo < -1 .or. kk_hi > extf(3))then
|
||||||
|
write(*,*)"restrict3: invalid stencil window"
|
||||||
|
write(*,*)"ii=",ii_lo,ii_hi," jj=",jj_lo,jj_hi," kk=",kk_lo,kk_hi
|
||||||
|
write(*,*)"extf=",extf
|
||||||
|
stop
|
||||||
|
endif
|
||||||
|
need_full_symmetry = (ii_lo < 1) .or. (jj_lo < 1) .or. (kk_lo < 1)
|
||||||
|
if(need_full_symmetry)then
|
||||||
|
call symmetry_bd(2,extf,funf,funff,SoA)
|
||||||
|
else
|
||||||
|
funff(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi) = funf(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi)
|
||||||
|
endif
|
||||||
|
|
||||||
!~~~~~~> restriction start...
|
!~~~~~~> restriction start...
|
||||||
do k = kmino, kmaxo
|
do k = kmino, kmaxo
|
||||||
@@ -2445,7 +2494,7 @@ do k = kmino, kmaxo
|
|||||||
! 优化点 1: 显式展开 Z 方向计算,减少循环开销
|
! 优化点 1: 显式展开 Z 方向计算,减少循环开销
|
||||||
! 确保 ii 循环是最内层且连续访问
|
! 确保 ii 循环是最内层且连续访问
|
||||||
!DIR$ VECTOR ALWAYS
|
!DIR$ VECTOR ALWAYS
|
||||||
do ii = 1, extf(1)
|
do ii = ii_lo, ii_hi
|
||||||
! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
|
! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
|
||||||
! 这里直接硬编码 jj 的偏移,彻底消除一层循环
|
! 这里直接硬编码 jj 的偏移,彻底消除一层循环
|
||||||
tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
|
tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
|
||||||
@@ -2470,7 +2519,7 @@ do k = kmino, kmaxo
|
|||||||
|
|
||||||
! 优化点 2: 同样向量化 Y 方向压缩
|
! 优化点 2: 同样向量化 Y 方向压缩
|
||||||
!DIR$ VECTOR ALWAYS
|
!DIR$ VECTOR ALWAYS
|
||||||
do ii = 1, extf(1)
|
do ii = ii_lo, ii_hi
|
||||||
tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
|
tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
|
||||||
C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
|
C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
|
||||||
C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
|
C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
|
||||||
|
|||||||
Reference in New Issue
Block a user