Compare commits
31 Commits
chb-rebase
...
yx-prolong
| Author | SHA1 | Date | |
|---|---|---|---|
| 12e1f63d50 | |||
| 47f91ff46f | |||
|
|
672b7ebee2 | ||
|
|
63bf180159 | ||
| 524d1d1512 | |||
| 44efb2e08c | |||
| 16013081e0 | |||
| 03416a7b28 | |||
| cca3c16c2b | |||
| e5231849ee | |||
| a766e49ff0 | |||
| 1a518cd3f6 | |||
| 1dc622e516 | |||
| 3046a0ccde | |||
| d4ec69c98a | |||
| 2c0a3055d4 | |||
| 1eba73acbe | |||
| b91cfff301 | |||
| e29ca2dca9 | |||
| 6493101ca0 | |||
| 169986cde1 | |||
| 1fbc213888 | |||
| 6024708a48 | |||
| bc457d981e | |||
| 51dead090e | |||
| 34d6922a66 | |||
| 8010ad27ed | |||
| 38e691f013 | |||
| 808387aa11 | |||
| c2b676abf2 | |||
| 2c60533501 |
@@ -7,7 +7,6 @@
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
#include <new>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
#include "misc.h"
|
||||
@@ -18,168 +17,6 @@ using namespace std;
|
||||
#include "interp_lb_profile.h"
|
||||
#endif
|
||||
|
||||
namespace
|
||||
{
|
||||
struct InterpBlockView
|
||||
{
|
||||
Block *bp;
|
||||
double llb[dim];
|
||||
double uub[dim];
|
||||
};
|
||||
|
||||
struct BlockBinIndex
|
||||
{
|
||||
int bins[dim];
|
||||
double lo[dim];
|
||||
double inv[dim];
|
||||
vector<InterpBlockView> views;
|
||||
vector<vector<int>> bin_to_blocks;
|
||||
bool valid;
|
||||
|
||||
BlockBinIndex() : valid(false)
|
||||
{
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
bins[i] = 1;
|
||||
lo[i] = 0.0;
|
||||
inv[i] = 0.0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Restrict v to [lo, hi]. The lower bound is tested first, so it wins when
// the bounds are inverted and v lies below lo.
inline int clamp_int(int v, int lo, int hi)
{
  if (v < lo)
    return lo;
  if (v > hi)
    return hi;
  return v;
}
|
||||
|
||||
// Map coordinate x to a bin number in [0, nb - 1].
//   lo  - coordinate at which bin 0 starts
//   inv - bins per unit length; a non-positive value disables binning
//   nb  - number of bins on this axis
// Returns 0 when binning is disabled (nb <= 1 or inv <= 0).
inline int coord_to_bin(double x, double lo, double inv, int nb)
{
  if (nb <= 1 || inv <= 0.0)
    return 0;

  const double cell = std::floor((x - lo) * inv);

  // Clamp while still in floating point: converting a NaN or a value outside
  // the range of int to int is undefined behaviour.
  if (!(cell > 0.0)) // below the first bin, or NaN
    return 0;
  if (cell >= double(nb - 1))
    return nb - 1;
  return int(cell);
}
|
||||
|
||||
inline int bin_loc(const BlockBinIndex &index, int b0, int b1, int b2)
|
||||
{
|
||||
return b0 + index.bins[0] * (b1 + index.bins[1] * b2);
|
||||
}
|
||||
|
||||
inline bool point_in_block_view(const InterpBlockView &view, const double *pox, const double *DH)
|
||||
{
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
if (pox[i] - view.llb[i] < -DH[i] / 2 || pox[i] - view.uub[i] > DH[i] / 2)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Rebuild `index` for `patch`: record the point-location range of every
// block between patch->blb and patch->ble, then register each block in all
// bins its range (widened by half a grid spacing) overlaps.
//   DH - grid spacing per axis
// `index` stays invalid when the patch has no blocks.
void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
{
#if defined(Vertex) && defined(Cell)
#error Both Cell and Vertex are defined
#endif
#if !defined(Vertex) && !defined(Cell)
#error Not define Vertex nor Cell
#endif

  index = BlockBinIndex();

  // Pass 1: one view per block.
  for (MyList<Block> *cur = patch->blb; cur; cur = cur->next)
  {
    Block *blk = cur->data;
    InterpBlockView box;
    box.bp = blk;
    for (int d = 0; d < dim; d++)
    {
      // A block face lying on the patch face uses the patch offsets
      // (lli / uui); any other face is pulled in by the ghost-zone width.
      if (feq(blk->bbox[d], patch->bbox[d], DH[d] / 2))
        box.llb[d] = blk->bbox[d] + patch->lli[d] * DH[d];
      else
#ifdef Vertex
        box.llb[d] = blk->bbox[d] + (ghost_width - 0.5) * DH[d];
#else
        box.llb[d] = blk->bbox[d] + ghost_width * DH[d];
#endif

      if (feq(blk->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
        box.uub[d] = blk->bbox[dim + d] - patch->uui[d] * DH[d];
      else
#ifdef Vertex
        box.uub[d] = blk->bbox[dim + d] - (ghost_width - 0.5) * DH[d];
#else
        box.uub[d] = blk->bbox[dim + d] - ghost_width * DH[d];
#endif
    }
    index.views.push_back(box);

    if (cur == patch->ble)
      break;
  }

  const int nblocks = int(index.views.size());
  if (nblocks < 1)
    return;

  // Roughly one block per bin: cube root of the block count, capped at 32.
  const int per_axis = clamp_int(int(ceil(pow(double(nblocks), 1.0 / 3.0))), 1, 32);
  for (int d = 0; d < dim; d++)
  {
    index.bins[d] = per_axis;
    index.lo[d] = patch->bbox[d] + patch->lli[d] * DH[d];
    const double hi = patch->bbox[dim + d] - patch->uui[d] * DH[d];
    index.inv[d] = (hi > index.lo[d] && per_axis > 1) ? per_axis / (hi - index.lo[d]) : 0.0;
  }

  index.bin_to_blocks.resize(index.bins[0] * index.bins[1] * index.bins[2]);

  // Pass 2: register each block in every bin its widened range touches.
  for (int bi = 0; bi < nblocks; bi++)
  {
    const InterpBlockView &view = index.views[bi];
    int first[dim], last[dim];
    for (int d = 0; d < dim; d++)
    {
      const int a = coord_to_bin(view.llb[d] - DH[d] / 2, index.lo[d], index.inv[d], index.bins[d]);
      const int b = coord_to_bin(view.uub[d] + DH[d] / 2, index.lo[d], index.inv[d], index.bins[d]);
      // Keep the pair ordered even if the range is inverted.
      first[d] = (b < a) ? b : a;
      last[d] = (b < a) ? a : b;
    }

    for (int bz = first[2]; bz <= last[2]; bz++)
      for (int by = first[1]; by <= last[1]; by++)
        for (int bx = first[0]; bx <= last[0]; bx++)
          index.bin_to_blocks[bin_loc(index, bx, by, bz)].push_back(bi);
  }

  index.valid = true;
}
|
||||
|
||||
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
|
||||
{
|
||||
if (!index.valid)
|
||||
return -1;
|
||||
|
||||
const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
|
||||
const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
|
||||
const int bz = coord_to_bin(pox[2], index.lo[2], index.inv[2], index.bins[2]);
|
||||
const vector<int> &cand = index.bin_to_blocks[bin_loc(index, bx, by, bz)];
|
||||
|
||||
for (size_t ci = 0; ci < cand.size(); ci++)
|
||||
{
|
||||
const int bi = cand[ci];
|
||||
if (point_in_block_view(index.views[bi], pox, DH))
|
||||
return bi;
|
||||
}
|
||||
|
||||
// Fallback to full scan for numerical edge cases around bin boundaries.
|
||||
for (size_t bi = 0; bi < index.views.size(); bi++)
|
||||
if (point_in_block_view(index.views[bi], pox, DH))
|
||||
return int(bi);
|
||||
|
||||
return -1;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
|
||||
{
|
||||
|
||||
@@ -530,11 +367,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
for (int j = 0; j < NN; j++)
|
||||
owner_rank[j] = -1;
|
||||
|
||||
double DH[dim];
|
||||
double DH[dim], llb[dim], uub[dim];
|
||||
for (int i = 0; i < dim; i++)
|
||||
DH[i] = getdX(i);
|
||||
BlockBinIndex block_index;
|
||||
build_block_bin_index(this, DH, block_index);
|
||||
|
||||
for (int j = 0; j < NN; j++) // run along points
|
||||
{
|
||||
@@ -557,24 +392,57 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
}
|
||||
}
|
||||
|
||||
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
||||
if (block_i >= 0)
|
||||
MyList<Block> *Bp = blb;
|
||||
bool notfind = true;
|
||||
while (notfind && Bp) // run along Blocks
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
Block *BP = Bp->data;
|
||||
|
||||
bool flag = true;
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
//---> interpolation
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl) // run along variables
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
#endif
|
||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
|
||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
|
||||
#else
|
||||
#ifdef Cell
|
||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
|
||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
|
||||
#else
|
||||
#error Not define Vertex nor Cell
|
||||
#endif
|
||||
#endif
|
||||
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
flag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (flag)
|
||||
{
|
||||
notfind = false;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
//---> interpolation
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl) // run along variables
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (Bp == ble)
|
||||
break;
|
||||
Bp = Bp->next;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -667,11 +535,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
for (int j = 0; j < NN; j++)
|
||||
owner_rank[j] = -1;
|
||||
|
||||
double DH[dim];
|
||||
double DH[dim], llb[dim], uub[dim];
|
||||
for (int i = 0; i < dim; i++)
|
||||
DH[i] = getdX(i);
|
||||
BlockBinIndex block_index;
|
||||
build_block_bin_index(this, DH, block_index);
|
||||
|
||||
// --- Interpolation phase (identical to original) ---
|
||||
for (int j = 0; j < NN; j++)
|
||||
@@ -695,23 +561,56 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
}
|
||||
}
|
||||
|
||||
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
||||
if (block_i >= 0)
|
||||
MyList<Block> *Bp = blb;
|
||||
bool notfind = true;
|
||||
while (notfind && Bp)
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
Block *BP = Bp->data;
|
||||
|
||||
bool flag = true;
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl)
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
#endif
|
||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
|
||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
|
||||
#else
|
||||
#ifdef Cell
|
||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
|
||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
|
||||
#else
|
||||
#error Not define Vertex nor Cell
|
||||
#endif
|
||||
#endif
|
||||
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
flag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (flag)
|
||||
{
|
||||
notfind = false;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl)
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (Bp == ble)
|
||||
break;
|
||||
Bp = Bp->next;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -934,11 +833,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
|
||||
MPI_Comm_group(Comm_here, &local_group);
|
||||
|
||||
double DH[dim];
|
||||
double DH[dim], llb[dim], uub[dim];
|
||||
for (int i = 0; i < dim; i++)
|
||||
DH[i] = getdX(i);
|
||||
BlockBinIndex block_index;
|
||||
build_block_bin_index(this, DH, block_index);
|
||||
|
||||
for (int j = 0; j < NN; j++) // run along points
|
||||
{
|
||||
@@ -961,24 +858,57 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
}
|
||||
}
|
||||
|
||||
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
||||
if (block_i >= 0)
|
||||
MyList<Block> *Bp = blb;
|
||||
bool notfind = true;
|
||||
while (notfind && Bp) // run along Blocks
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
Block *BP = Bp->data;
|
||||
|
||||
bool flag = true;
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
//---> interpolation
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl) // run along variables
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
#endif
|
||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
|
||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
|
||||
#else
|
||||
#ifdef Cell
|
||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
|
||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
|
||||
#else
|
||||
#error Not define Vertex nor Cell
|
||||
#endif
|
||||
#endif
|
||||
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
flag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (flag)
|
||||
{
|
||||
notfind = false;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
//---> interpolation
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl) // run along variables
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (Bp == ble)
|
||||
break;
|
||||
Bp = Bp->next;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3883,263 +3883,175 @@ int Parallel::data_packermix(double *data, MyList<Parallel::gridseg> *src, MyLis
|
||||
return size_out;
|
||||
}
|
||||
//
|
||||
// Exchange grid-segment data between all ranks of MPI_COMM_WORLD.
//   src / dst  - per-rank segment lists, indexed by rank
//   VarList1   - source variables, VarList2 - target variables
//   Symmetry   - forwarded unchanged to data_packer
// data_packer called with a null buffer is used as a sizing pass: its return
// value is taken as the number of doubles to allocate.
// Receives are posted first, then the local copy is packed, then sends are
// packed and posted; received buffers are unpacked as they complete.
void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
                        MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
                        int Symmetry)
{
  int myrank, cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  int node;

  // Room for one receive and one send request per rank.
  MPI_Request *reqs = new MPI_Request[2 * cpusize];
  MPI_Status *stats = new MPI_Status[2 * cpusize];
  int *req_node = new int[2 * cpusize];    // peer rank of each posted request
  int *req_is_recv = new int[2 * cpusize]; // 1 for a receive, 0 for a send
  int *completed = new int[2 * cpusize];   // indices reported by MPI_Waitsome
  int req_no = 0;
  int pending_recv = 0;

  double **send_data = new double *[cpusize];
  double **rec_data = new double *[cpusize];
  int *send_lengths = new int[cpusize];
  int *recv_lengths = new int[cpusize];

  for (node = 0; node < cpusize; node++)
  {
    send_data[node] = rec_data[node] = 0;
    send_lengths[node] = recv_lengths[node] = 0;
  }

  // Post receives first so peers can progress rendezvous early.
  for (node = 0; node < cpusize; node++)
  {
    if (node == myrank) continue;

    recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
    if (recv_lengths[node] > 0)
    {
      // nothrow: plain new throws instead of returning null, which would
      // make the out-of-memory check below unreachable.
      rec_data[node] = new (std::nothrow) double[recv_lengths[node]];
      if (!rec_data[node])
      {
        cout << "out of memory when new in short transfer, place 1" << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
      req_node[req_no] = node;
      req_is_recv[req_no] = 1;
      req_no++;
      pending_recv++;
    }
  }

  // Local transfer on this rank.
  recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  if (recv_lengths[myrank] > 0)
  {
    rec_data[myrank] = new (std::nothrow) double[recv_lengths[myrank]];
    if (!rec_data[myrank])
    {
      cout << "out of memory when new in short transfer, place 2" << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  }

  // Pack and post sends.
  for (node = 0; node < cpusize; node++)
  {
    if (node == myrank) continue;

    send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
    if (send_lengths[node] > 0)
    {
      send_data[node] = new (std::nothrow) double[send_lengths[node]];
      if (!send_data[node])
      {
        cout << "out of memory when new in short transfer, place 3" << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
      req_node[req_no] = node;
      req_is_recv[req_no] = 0;
      req_no++;
    }
  }

  // Unpack as soon as receive completes to reduce pure wait time.
  while (pending_recv > 0)
  {
    int outcount = 0;
    MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
    if (outcount == MPI_UNDEFINED) break; // no active requests left

    for (int i = 0; i < outcount; i++)
    {
      int idx = completed[i];
      if (idx >= 0 && req_is_recv[idx])
      {
        int recv_node = req_node[idx];
        data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
        pending_recv--;
      }
    }
  }

  // Sends may still be in flight; their buffers must outlive them.
  if (req_no > 0) MPI_Waitall(req_no, reqs, stats);

  // The local copy is unpacked last.
  if (rec_data[myrank])
    data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);

  for (node = 0; node < cpusize; node++)
  {
    if (send_data[node])
      delete[] send_data[node];
    if (rec_data[node])
      delete[] rec_data[node];
  }

  delete[] reqs;
  delete[] stats;
  delete[] req_node;
  delete[] req_is_recv;
  delete[] completed;
  delete[] send_data;
  delete[] rec_data;
  delete[] send_lengths;
  delete[] recv_lengths;
}
|
||||
// Exchange grid-segment data between all ranks of MPI_COMM_WORLD.
//   src / dst  - per-rank segment lists, indexed by rank
//   VarList1   - source variables, VarList2 - target variables
//   Symmetry   - forwarded unchanged to data_packer
// data_packer called with a null buffer is used as a sizing pass: its return
// value is taken as the number of doubles to allocate.
// All sends and receives are posted, completed with one MPI_Waitall, and the
// received buffers (including the local copy) are then unpacked in rank order.
void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
                        MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
                        int Symmetry)
{
  int myrank, cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  int node;

  // Room for one send and one receive request per rank.
  MPI_Request *reqs = new MPI_Request[2 * cpusize];
  MPI_Status *stats = new MPI_Status[2 * cpusize];
  int req_no = 0;

  double **send_data = new double *[cpusize];
  double **rec_data = new double *[cpusize];
  int length;

  for (node = 0; node < cpusize; node++)
  {
    send_data[node] = rec_data[node] = 0;
    if (node == myrank)
    {
      // Local copy: pack straight into the receive slot of this rank.
      length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      if (length)
      {
        // nothrow: plain new throws instead of returning null, which would
        // make the out-of-memory check below unreachable.
        rec_data[node] = new (std::nothrow) double[length];
        if (!rec_data[node])
        {
          cout << "out of memory when new in short transfer, place 1" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        data_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
    }
    else
    {
      // send from this cpu to cpu#node
      length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      if (length)
      {
        send_data[node] = new (std::nothrow) double[length];
        if (!send_data[node])
        {
          cout << "out of memory when new in short transfer, place 2" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
      }
      // receive from cpu#node to this cpu
      length = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
      if (length)
      {
        rec_data[node] = new (std::nothrow) double[length];
        if (!rec_data[node])
        {
          cout << "out of memory when new in short transfer, place 3" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
      }
    }
  }
  // wait for all requests to complete
  MPI_Waitall(req_no, reqs, stats);

  for (node = 0; node < cpusize; node++)
    if (rec_data[node])
      data_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);

  for (node = 0; node < cpusize; node++)
  {
    if (send_data[node])
      delete[] send_data[node];
    if (rec_data[node])
      delete[] rec_data[node];
  }

  delete[] reqs;
  delete[] stats;
  delete[] send_data;
  delete[] rec_data;
}
|
||||
//
|
||||
// Exchange grid-segment data between all ranks of MPI_COMM_WORLD, packing
// and unpacking with data_packermix.
//   src / dst  - per-rank segment lists, indexed by rank
//   VarList1   - source variables, VarList2 - target variables
//   Symmetry   - forwarded unchanged to data_packermix
// data_packermix called with a null buffer is used as a sizing pass: its
// return value is taken as the number of doubles to allocate.
// Receives are posted first, then the local copy is packed, then sends are
// packed and posted; received buffers are unpacked as they complete.
void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
                           int Symmetry)
{
  int myrank, cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  int node;

  // Room for one receive and one send request per rank.
  MPI_Request *reqs = new MPI_Request[2 * cpusize];
  MPI_Status *stats = new MPI_Status[2 * cpusize];
  int *req_node = new int[2 * cpusize];    // peer rank of each posted request
  int *req_is_recv = new int[2 * cpusize]; // 1 for a receive, 0 for a send
  int *completed = new int[2 * cpusize];   // indices reported by MPI_Waitsome
  int req_no = 0;
  int pending_recv = 0;

  double **send_data = new double *[cpusize];
  double **rec_data = new double *[cpusize];
  int *send_lengths = new int[cpusize];
  int *recv_lengths = new int[cpusize];

  for (node = 0; node < cpusize; node++)
  {
    send_data[node] = rec_data[node] = 0;
    send_lengths[node] = recv_lengths[node] = 0;
  }

  // Post receives first so peers can progress rendezvous early.
  for (node = 0; node < cpusize; node++)
  {
    if (node == myrank) continue;

    recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
    if (recv_lengths[node] > 0)
    {
      // nothrow: plain new throws instead of returning null, which would
      // make the out-of-memory check below unreachable.
      rec_data[node] = new (std::nothrow) double[recv_lengths[node]];
      if (!rec_data[node])
      {
        cout << "out of memory when new in short transfer, place 1" << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
      req_node[req_no] = node;
      req_is_recv[req_no] = 1;
      req_no++;
      pending_recv++;
    }
  }

  // Local transfer on this rank.
  recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  if (recv_lengths[myrank] > 0)
  {
    rec_data[myrank] = new (std::nothrow) double[recv_lengths[myrank]];
    if (!rec_data[myrank])
    {
      cout << "out of memory when new in short transfer, place 2" << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  }

  // Pack and post sends.
  for (node = 0; node < cpusize; node++)
  {
    if (node == myrank) continue;

    send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
    if (send_lengths[node] > 0)
    {
      send_data[node] = new (std::nothrow) double[send_lengths[node]];
      if (!send_data[node])
      {
        cout << "out of memory when new in short transfer, place 3" << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
      req_node[req_no] = node;
      req_is_recv[req_no] = 0;
      req_no++;
    }
  }

  // Unpack as soon as receive completes to reduce pure wait time.
  while (pending_recv > 0)
  {
    int outcount = 0;
    MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
    if (outcount == MPI_UNDEFINED) break; // no active requests left

    for (int i = 0; i < outcount; i++)
    {
      int idx = completed[i];
      if (idx >= 0 && req_is_recv[idx])
      {
        int recv_node = req_node[idx];
        data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
        pending_recv--;
      }
    }
  }

  // Sends may still be in flight; their buffers must outlive them.
  if (req_no > 0) MPI_Waitall(req_no, reqs, stats);

  // The local copy is unpacked last.
  if (rec_data[myrank])
    data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);

  for (node = 0; node < cpusize; node++)
  {
    if (send_data[node])
      delete[] send_data[node];
    if (rec_data[node])
      delete[] rec_data[node];
  }

  delete[] reqs;
  delete[] stats;
  delete[] req_node;
  delete[] req_is_recv;
  delete[] completed;
  delete[] send_data;
  delete[] rec_data;
  delete[] send_lengths;
  delete[] recv_lengths;
}
|
||||
// Exchange grid-segment data between all ranks of MPI_COMM_WORLD, packing
// and unpacking with data_packermix.
//   src / dst  - per-rank segment lists, indexed by rank
//   VarList1   - source variables, VarList2 - target variables
//   Symmetry   - forwarded unchanged to data_packermix
// data_packermix called with a null buffer is used as a sizing pass: its
// return value is taken as the number of doubles to allocate.
// All sends and receives are posted, completed with one MPI_Waitall, and the
// received buffers (including the local copy) are then unpacked in rank order.
void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
                           MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
                           int Symmetry)
{
  int myrank, cpusize;
  MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  int node;

  // Room for one send and one receive request per rank.
  MPI_Request *reqs = new MPI_Request[2 * cpusize];
  MPI_Status *stats = new MPI_Status[2 * cpusize];
  int req_no = 0;

  double **send_data = new double *[cpusize];
  double **rec_data = new double *[cpusize];
  int length;

  for (node = 0; node < cpusize; node++)
  {
    send_data[node] = rec_data[node] = 0;
    if (node == myrank)
    {
      // Local copy: pack straight into the receive slot of this rank.
      length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      if (length)
      {
        // nothrow: plain new throws instead of returning null, which would
        // make the out-of-memory check below unreachable.
        rec_data[node] = new (std::nothrow) double[length];
        if (!rec_data[node])
        {
          cout << "out of memory when new in short transfer, place 1" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        data_packermix(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
    }
    else
    {
      // send from this cpu to cpu#node
      length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      if (length)
      {
        send_data[node] = new (std::nothrow) double[length];
        if (!send_data[node])
        {
          cout << "out of memory when new in short transfer, place 2" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
      }
      // receive from cpu#node to this cpu
      length = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
      if (length)
      {
        rec_data[node] = new (std::nothrow) double[length];
        if (!rec_data[node])
        {
          cout << "out of memory when new in short transfer, place 3" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
      }
    }
  }
  // wait for all requests to complete
  MPI_Waitall(req_no, reqs, stats);

  for (node = 0; node < cpusize; node++)
    if (rec_data[node])
      data_packermix(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);

  for (node = 0; node < cpusize; node++)
  {
    if (send_data[node])
      delete[] send_data[node];
    if (rec_data[node])
      delete[] rec_data[node];
  }

  delete[] reqs;
  delete[] stats;
  delete[] send_data;
  delete[] rec_data;
}
|
||||
void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
|
||||
{
|
||||
int cpusize;
|
||||
@@ -4367,110 +4279,73 @@ void Parallel::SyncCache::destroy()
|
||||
cpusize = 0; max_reqs = 0;
|
||||
}
|
||||
// transfer_cached: reuse pre-allocated buffers from SyncCache
|
||||
void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
|
||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||
int Symmetry, SyncCache &cache)
|
||||
{
|
||||
int myrank;
|
||||
// Same exchange as transfer(), but the message buffers, request array and
// status array come from `cache` and are kept for later calls; a buffer is
// reallocated only when the required length exceeds its recorded capacity.
void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
                               MyList<var> *VarList1, MyList<var> *VarList2,
                               int Symmetry, SyncCache &cache)
{
  int myrank;
  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = cache.cpusize;

  int nreq = 0;       // requests posted so far
  int recvs_left = 0; // receives not yet unpacked
  int node;

  // Per-call bookkeeping, indexed by request slot.
  int *peer_of = new int[cache.max_reqs];
  int *is_recv = new int[cache.max_reqs];
  int *done_idx = new int[cache.max_reqs];

  // Post receives first so peers can progress rendezvous early.
  for (node = 0; node < cpusize; node++)
  {
    if (node != myrank)
    {
      const int need = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = need;
      if (need > 0)
      {
        if (need > cache.recv_buf_caps[node])
        {
          delete[] cache.recv_bufs[node]; // deleting a null pointer is a no-op
          cache.recv_bufs[node] = new double[need];
          cache.recv_buf_caps[node] = need;
        }
        MPI_Irecv((void *)cache.recv_bufs[node], need, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + nreq);
        peer_of[nreq] = node;
        is_recv[nreq] = 1;
        ++nreq;
        ++recvs_left;
      }
    }
  }

  // Local transfer on this rank.
  const int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  cache.recv_lengths[myrank] = self_len;
  if (self_len > 0)
  {
    if (self_len > cache.recv_buf_caps[myrank])
    {
      delete[] cache.recv_bufs[myrank];
      cache.recv_bufs[myrank] = new double[self_len];
      cache.recv_buf_caps[myrank] = self_len;
    }
    data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
  }

  // Pack and post sends.
  for (node = 0; node < cpusize; node++)
  {
    if (node != myrank)
    {
      const int need = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.send_lengths[node] = need;
      if (need > 0)
      {
        if (need > cache.send_buf_caps[node])
        {
          delete[] cache.send_bufs[node];
          cache.send_bufs[node] = new double[need];
          cache.send_buf_caps[node] = need;
        }
        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], need, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + nreq);
        peer_of[nreq] = node;
        is_recv[nreq] = 0;
        ++nreq;
      }
    }
  }

  // Unpack as soon as receive completes to reduce pure wait time.
  while (recvs_left > 0)
  {
    int ndone = 0;
    MPI_Waitsome(nreq, cache.reqs, &ndone, done_idx, cache.stats);
    if (ndone == MPI_UNDEFINED)
      break;

    for (int i = 0; i < ndone; i++)
    {
      const int slot = done_idx[i];
      if (slot < 0 || !is_recv[slot])
        continue; // completed sends need no further work
      const int from = peer_of[slot];
      data_packer(cache.recv_bufs[from], src[from], dst[from], from, UNPACK, VarList1, VarList2, Symmetry);
      --recvs_left;
    }
  }

  if (nreq > 0)
    MPI_Waitall(nreq, cache.reqs, cache.stats);

  // The local copy is unpacked last.
  if (self_len > 0)
    data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);

  delete[] peer_of;
  delete[] is_recv;
  delete[] done_idx;
}
|
||||
int req_no = 0;
|
||||
int node;
|
||||
|
||||
for (node = 0; node < cpusize; node++)
|
||||
{
|
||||
if (node == myrank)
|
||||
{
|
||||
int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
cache.recv_lengths[node] = length;
|
||||
if (length > 0)
|
||||
{
|
||||
if (length > cache.recv_buf_caps[node])
|
||||
{
|
||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
||||
cache.recv_bufs[node] = new double[length];
|
||||
cache.recv_buf_caps[node] = length;
|
||||
}
|
||||
data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// send
|
||||
int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
cache.send_lengths[node] = slength;
|
||||
if (slength > 0)
|
||||
{
|
||||
if (slength > cache.send_buf_caps[node])
|
||||
{
|
||||
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
|
||||
cache.send_bufs[node] = new double[slength];
|
||||
cache.send_buf_caps[node] = slength;
|
||||
}
|
||||
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
|
||||
}
|
||||
// recv
|
||||
int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
||||
cache.recv_lengths[node] = rlength;
|
||||
if (rlength > 0)
|
||||
{
|
||||
if (rlength > cache.recv_buf_caps[node])
|
||||
{
|
||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
||||
cache.recv_bufs[node] = new double[rlength];
|
||||
cache.recv_buf_caps[node] = rlength;
|
||||
}
|
||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Waitall(req_no, cache.reqs, cache.stats);
|
||||
|
||||
for (node = 0; node < cpusize; node++)
|
||||
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
|
||||
data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
// Sync_cached: build grid segment lists on first call, reuse on subsequent calls
|
||||
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
|
||||
{
|
||||
@@ -5883,9 +5758,9 @@ void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
}
|
||||
|
||||
// OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking
|
||||
void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||
int Symmetry, SyncCache &cache)
|
||||
void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||
int Symmetry, SyncCache &cache)
|
||||
{
|
||||
if (!cache.valid)
|
||||
{
|
||||
@@ -5931,100 +5806,60 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
int cpusize = cache.cpusize;
|
||||
|
||||
int req_no = 0;
|
||||
int pending_recv = 0;
|
||||
int *req_node = new int[cache.max_reqs];
|
||||
int *req_is_recv = new int[cache.max_reqs];
|
||||
int *completed = new int[cache.max_reqs];
|
||||
|
||||
// Post receives first so peers can progress rendezvous early.
|
||||
for (int node = 0; node < cpusize; node++)
|
||||
{
|
||||
if (node == myrank) continue;
|
||||
|
||||
int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
||||
cache.recv_lengths[node] = rlength;
|
||||
if (rlength > 0)
|
||||
{
|
||||
if (rlength > cache.recv_buf_caps[node])
|
||||
{
|
||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
||||
cache.recv_bufs[node] = new double[rlength];
|
||||
cache.recv_buf_caps[node] = rlength;
|
||||
}
|
||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||
req_node[req_no] = node;
|
||||
req_is_recv[req_no] = 1;
|
||||
req_no++;
|
||||
pending_recv++;
|
||||
}
|
||||
}
|
||||
|
||||
// Local transfer on this rank.
|
||||
int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||
cache.recv_lengths[myrank] = self_len;
|
||||
if (self_len > 0)
|
||||
{
|
||||
if (self_len > cache.recv_buf_caps[myrank])
|
||||
{
|
||||
if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank];
|
||||
cache.recv_bufs[myrank] = new double[self_len];
|
||||
cache.recv_buf_caps[myrank] = self_len;
|
||||
}
|
||||
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
|
||||
// Pack and post sends.
|
||||
for (int node = 0; node < cpusize; node++)
|
||||
{
|
||||
if (node == myrank) continue;
|
||||
|
||||
int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
cache.send_lengths[node] = slength;
|
||||
if (slength > 0)
|
||||
{
|
||||
if (slength > cache.send_buf_caps[node])
|
||||
{
|
||||
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
|
||||
cache.send_bufs[node] = new double[slength];
|
||||
cache.send_buf_caps[node] = slength;
|
||||
}
|
||||
data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||
req_node[req_no] = node;
|
||||
req_is_recv[req_no] = 0;
|
||||
req_no++;
|
||||
}
|
||||
}
|
||||
|
||||
// Unpack as soon as receive completes to reduce pure wait time.
|
||||
while (pending_recv > 0)
|
||||
{
|
||||
int outcount = 0;
|
||||
MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
|
||||
if (outcount == MPI_UNDEFINED) break;
|
||||
|
||||
for (int i = 0; i < outcount; i++)
|
||||
{
|
||||
int idx = completed[i];
|
||||
if (idx >= 0 && req_is_recv[idx])
|
||||
{
|
||||
int recv_node_i = req_node[idx];
|
||||
data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
|
||||
pending_recv--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
|
||||
|
||||
if (self_len > 0)
|
||||
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||
|
||||
delete[] req_node;
|
||||
delete[] req_is_recv;
|
||||
delete[] completed;
|
||||
}
|
||||
int req_no = 0;
|
||||
for (int node = 0; node < cpusize; node++)
|
||||
{
|
||||
if (node == myrank)
|
||||
{
|
||||
int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
cache.recv_lengths[node] = length;
|
||||
if (length > 0)
|
||||
{
|
||||
if (length > cache.recv_buf_caps[node])
|
||||
{
|
||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
||||
cache.recv_bufs[node] = new double[length];
|
||||
cache.recv_buf_caps[node] = length;
|
||||
}
|
||||
data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
cache.send_lengths[node] = slength;
|
||||
if (slength > 0)
|
||||
{
|
||||
if (slength > cache.send_buf_caps[node])
|
||||
{
|
||||
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
|
||||
cache.send_bufs[node] = new double[slength];
|
||||
cache.send_buf_caps[node] = slength;
|
||||
}
|
||||
data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
|
||||
}
|
||||
int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
||||
cache.recv_lengths[node] = rlength;
|
||||
if (rlength > 0)
|
||||
{
|
||||
if (rlength > cache.recv_buf_caps[node])
|
||||
{
|
||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
||||
cache.recv_bufs[node] = new double[rlength];
|
||||
cache.recv_buf_caps[node] = rlength;
|
||||
}
|
||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Waitall(req_no, cache.reqs, cache.stats);
|
||||
|
||||
for (int node = 0; node < cpusize; node++)
|
||||
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
|
||||
data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
|
||||
// collect all buffer grid segments or blocks for given patch
|
||||
MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(Patch *Pat)
|
||||
|
||||
@@ -39,7 +39,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
// printf("nx=%d ny=%d nz=%d all=%d\n", nx, ny, nz, all);
|
||||
|
||||
// temp variable
|
||||
double gxx[all],gyy[all],gzz[all];
|
||||
double chix[all],chiy[all],chiz[all];
|
||||
double gxxx[all],gxyx[all],gxzx[all],gyyx[all],gyzx[all],gzzx[all];
|
||||
double gxxy[all],gxyy[all],gxzy[all],gyyy[all],gyzy[all],gzzy[all];
|
||||
@@ -51,9 +50,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
double Gamxx[all],Gamxy[all],Gamxz[all];
|
||||
double Gamyx[all],Gamyy[all],Gamyz[all];
|
||||
double Gamzx[all],Gamzy[all],Gamzz[all];
|
||||
double Kx[all], Ky[all], Kz[all], div_beta[all], S[all];
|
||||
double Kx[all], Ky[all], Kz[all], S[all];
|
||||
double f[all], fxx[all], fxy[all], fxz[all], fyy[all], fyz[all], fzz[all];
|
||||
double Gamxa[all], Gamya[all], Gamza[all], alpn1[all], chin1[all];
|
||||
double alpn1[all], chin1[all];
|
||||
double gupxx[all], gupxy[all], gupxz[all];
|
||||
double gupyy[all], gupyz[all], gupzz[all];
|
||||
double SSS[3] = { 1.0, 1.0, 1.0};
|
||||
@@ -107,9 +106,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
for(int i=0;i<all;i+=1){
|
||||
alpn1[i] = Lap[i] + 1.0;
|
||||
chin1[i] = chi[i] + 1.0;
|
||||
gxx[i] = dxx[i] + 1.0;
|
||||
gyy[i] = dyy[i] + 1.0;
|
||||
gzz[i] = dzz[i] + 1.0;
|
||||
}
|
||||
// 9ms //
|
||||
fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev);
|
||||
@@ -127,231 +123,196 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
|
||||
// 3ms //
|
||||
for(int i=0;i<all;i+=1){
|
||||
div_beta[i] = betaxx[i] + betayy[i] + betazz[i];
|
||||
chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - div_beta[i]);
|
||||
gxx_rhs[i] = -TWO * alpn1[i] * Axx[i] - F2o3 * gxx[i] * div_beta[i] +
|
||||
TWO * (gxx[i] * betaxx[i] + gxy[i] * betayx[i] + gxz[i] * betazx[i]);
|
||||
gyy_rhs[i] = -TWO * alpn1[i] * Ayy[i] - F2o3 * gyy[i] * div_beta[i] +
|
||||
TWO * (gxy[i] * betaxy[i] + gyy[i] * betayy[i] + gyz[i] * betazy[i]);
|
||||
gzz_rhs[i] = -TWO * alpn1[i] * Azz[i] - F2o3 * gzz[i] * div_beta[i] +
|
||||
TWO * (gxz[i] * betaxz[i] + gyz[i] * betayz[i] + gzz[i] * betazz[i]);
|
||||
gxy_rhs[i] = -TWO * alpn1[i] * Axy[i] + F1o3 * gxy[i] * div_beta[i] +
|
||||
gxx[i] * betaxy[i] + gxz[i] * betazy[i] + gyy[i] * betayx[i]
|
||||
const double divb = betaxx[i] + betayy[i] + betazz[i];
|
||||
chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - divb);
|
||||
gxx_rhs[i] = -TWO * alpn1[i] * Axx[i] - F2o3 * (dxx[i] + ONE) * divb +
|
||||
TWO * ((dxx[i] + ONE) * betaxx[i] + gxy[i] * betayx[i] + gxz[i] * betazx[i]);
|
||||
gyy_rhs[i] = -TWO * alpn1[i] * Ayy[i] - F2o3 * (dyy[i] + ONE) * divb +
|
||||
TWO * (gxy[i] * betaxy[i] + (dyy[i] + ONE) * betayy[i] + gyz[i] * betazy[i]);
|
||||
gzz_rhs[i] = -TWO * alpn1[i] * Azz[i] - F2o3 * (dzz[i] + ONE) * divb +
|
||||
TWO * (gxz[i] * betaxz[i] + gyz[i] * betayz[i] + (dzz[i] + ONE) * betazz[i]);
|
||||
gxy_rhs[i] = -TWO * alpn1[i] * Axy[i] + F1o3 * gxy[i] * divb +
|
||||
(dxx[i] + ONE) * betaxy[i] + gxz[i] * betazy[i] + (dyy[i] + ONE) * betayx[i]
|
||||
+ gyz[i] * betazx[i] - gxy[i] * betazz[i];
|
||||
gyz_rhs[i] = -TWO * alpn1[i] * Ayz[i] + F1o3 * gyz[i] * div_beta[i] +
|
||||
gxy[i] * betaxz[i] + gyy[i] * betayz[i] + gxz[i] * betaxy[i]
|
||||
+ gzz[i] * betazy[i] - gyz[i] * betaxx[i];
|
||||
gxz_rhs[i] = -TWO * alpn1[i] * Axz[i] + F1o3 * gxz[i] * div_beta[i] +
|
||||
gxx[i] * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i]
|
||||
+ gzz[i] * betazx[i] - gxz[i] * betayy[i];
|
||||
gyz_rhs[i] = -TWO * alpn1[i] * Ayz[i] + F1o3 * gyz[i] * divb +
|
||||
gxy[i] * betaxz[i] + (dyy[i] + ONE) * betayz[i] + gxz[i] * betaxy[i]
|
||||
+ (dzz[i] + ONE) * betazy[i] - gyz[i] * betaxx[i];
|
||||
gxz_rhs[i] = -TWO * alpn1[i] * Axz[i] + F1o3 * gxz[i] * divb +
|
||||
(dxx[i] + ONE) * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i]
|
||||
+ (dzz[i] + ONE) * betazx[i] - gxz[i] * betayy[i];
|
||||
}
|
||||
// 1ms //
|
||||
// Fused: inverse metric + Gamma constraint + Christoffel (3 loops -> 1)
|
||||
for(int i=0;i<all;i+=1){
|
||||
double det = gxx[i] * gyy[i] * gzz[i] + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] -
|
||||
gxz[i] * gyy[i] * gxz[i] - gxy[i] * gxy[i] * gzz[i] - gxx[i] * gyz[i] * gyz[i];
|
||||
gupxx[i] = (gyy[i] * gzz[i] - gyz[i] * gyz[i]) / det;
|
||||
gupxy[i] = -(gxy[i] * gzz[i] - gyz[i] * gxz[i]) / det;
|
||||
gupxz[i] = (gxy[i] * gyz[i] - gyy[i] * gxz[i]) / det;
|
||||
gupyy[i] = (gxx[i] * gzz[i] - gxz[i] * gxz[i]) / det;
|
||||
gupyz[i] = -(gxx[i] * gyz[i] - gxy[i] * gxz[i]) / det;
|
||||
gupzz[i] = (gxx[i] * gyy[i] - gxy[i] * gxy[i]) / det;
|
||||
}
|
||||
// 2.2ms //
|
||||
if(co==0){
|
||||
for (int i=0;i<all;i+=1) {
|
||||
double det = (dxx[i] + ONE) * (dyy[i] + ONE) * (dzz[i] + ONE) + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] -
|
||||
gxz[i] * (dyy[i] + ONE) * gxz[i] - gxy[i] * gxy[i] * (dzz[i] + ONE) - (dxx[i] + ONE) * gyz[i] * gyz[i];
|
||||
double lg_xx = ((dyy[i] + ONE) * (dzz[i] + ONE) - gyz[i] * gyz[i]) / det;
|
||||
double lg_xy = -(gxy[i] * (dzz[i] + ONE) - gyz[i] * gxz[i]) / det;
|
||||
double lg_xz = (gxy[i] * gyz[i] - (dyy[i] + ONE) * gxz[i]) / det;
|
||||
double lg_yy = ((dxx[i] + ONE) * (dzz[i] + ONE) - gxz[i] * gxz[i]) / det;
|
||||
double lg_yz = -((dxx[i] + ONE) * gyz[i] - gxy[i] * gxz[i]) / det;
|
||||
double lg_zz = ((dxx[i] + ONE) * (dyy[i] + ONE) - gxy[i] * gxy[i]) / det;
|
||||
gupxx[i] = lg_xx; gupxy[i] = lg_xy; gupxz[i] = lg_xz;
|
||||
gupyy[i] = lg_yy; gupyz[i] = lg_yz; gupzz[i] = lg_zz;
|
||||
|
||||
if(co==0){
|
||||
Gmx_Res[i] = Gamx[i] - (
|
||||
gupxx[i] * (gupxx[i]*gxxx[i] + gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i]) +
|
||||
gupxy[i] * (gupxx[i]*gxyx[i] + gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i]) +
|
||||
gupxz[i] * (gupxx[i]*gxzx[i] + gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i]) +
|
||||
|
||||
gupxx[i] * (gupxy[i]*gxxy[i] + gupyy[i]*gxyy[i] + gupyz[i]*gxzy[i]) +
|
||||
gupxy[i] * (gupxy[i]*gxyy[i] + gupyy[i]*gyyy[i] + gupyz[i]*gyzy[i]) +
|
||||
gupxz[i] * (gupxy[i]*gxzy[i] + gupyy[i]*gyzy[i] + gupyz[i]*gzzy[i]) +
|
||||
|
||||
gupxx[i] * (gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i] + gupzz[i]*gxzz[i]) +
|
||||
gupxy[i] * (gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i] + gupzz[i]*gyzz[i]) +
|
||||
gupxz[i] * (gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i] + gupzz[i]*gzzz[i])
|
||||
lg_xx * (lg_xx*gxxx[i] + lg_xy*gxyx[i] + lg_xz*gxzx[i]) +
|
||||
lg_xy * (lg_xx*gxyx[i] + lg_xy*gyyx[i] + lg_xz*gyzx[i]) +
|
||||
lg_xz * (lg_xx*gxzx[i] + lg_xy*gyzx[i] + lg_xz*gzzx[i]) +
|
||||
lg_xx * (lg_xy*gxxy[i] + lg_yy*gxyy[i] + lg_yz*gxzy[i]) +
|
||||
lg_xy * (lg_xy*gxyy[i] + lg_yy*gyyy[i] + lg_yz*gyzy[i]) +
|
||||
lg_xz * (lg_xy*gxzy[i] + lg_yy*gyzy[i] + lg_yz*gzzy[i]) +
|
||||
lg_xx * (lg_xz*gxxz[i] + lg_yz*gxyz[i] + lg_zz*gxzz[i]) +
|
||||
lg_xy * (lg_xz*gxyz[i] + lg_yz*gyyz[i] + lg_zz*gyzz[i]) +
|
||||
lg_xz * (lg_xz*gxzz[i] + lg_yz*gyzz[i] + lg_zz*gzzz[i])
|
||||
);
|
||||
|
||||
Gmy_Res[i] = Gamy[i] - (
|
||||
gupxx[i] * (gupxy[i]*gxxx[i] + gupyy[i]*gxyx[i] + gupyz[i]*gxzx[i]) +
|
||||
gupxy[i] * (gupxy[i]*gxyx[i] + gupyy[i]*gyyx[i] + gupyz[i]*gyzx[i]) +
|
||||
gupxz[i] * (gupxy[i]*gxzx[i] + gupyy[i]*gyzx[i] + gupyz[i]*gzzx[i]) +
|
||||
|
||||
gupxy[i] * (gupxy[i]*gxxy[i] + gupyy[i]*gxyy[i] + gupyz[i]*gxzy[i]) +
|
||||
gupyy[i] * (gupxy[i]*gxyy[i] + gupyy[i]*gyyy[i] + gupyz[i]*gyzy[i]) +
|
||||
gupyz[i] * (gupxy[i]*gxzy[i] + gupyy[i]*gyzy[i] + gupyz[i]*gzzy[i]) +
|
||||
|
||||
gupxy[i] * (gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i] + gupzz[i]*gxzz[i]) +
|
||||
gupyy[i] * (gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i] + gupzz[i]*gyzz[i]) +
|
||||
gupyz[i] * (gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i] + gupzz[i]*gzzz[i])
|
||||
lg_xx * (lg_xy*gxxx[i] + lg_yy*gxyx[i] + lg_yz*gxzx[i]) +
|
||||
lg_xy * (lg_xy*gxyx[i] + lg_yy*gyyx[i] + lg_yz*gyzx[i]) +
|
||||
lg_xz * (lg_xy*gxzx[i] + lg_yy*gyzx[i] + lg_yz*gzzx[i]) +
|
||||
lg_xy * (lg_xy*gxxy[i] + lg_yy*gxyy[i] + lg_yz*gxzy[i]) +
|
||||
lg_yy * (lg_xy*gxyy[i] + lg_yy*gyyy[i] + lg_yz*gyzy[i]) +
|
||||
lg_yz * (lg_xy*gxzy[i] + lg_yy*gyzy[i] + lg_yz*gzzy[i]) +
|
||||
lg_xy * (lg_xz*gxxz[i] + lg_yz*gxyz[i] + lg_zz*gxzz[i]) +
|
||||
lg_yy * (lg_xz*gxyz[i] + lg_yz*gyyz[i] + lg_zz*gyzz[i]) +
|
||||
lg_yz * (lg_xz*gxzz[i] + lg_yz*gyzz[i] + lg_zz*gzzz[i])
|
||||
);
|
||||
|
||||
Gmz_Res[i] = Gamz[i] - (
|
||||
gupxx[i] * (gupxz[i]*gxxx[i] + gupyz[i]*gxyx[i] + gupzz[i]*gxzx[i]) +
|
||||
gupxy[i] * (gupxz[i]*gxyx[i] + gupyz[i]*gyyx[i] + gupzz[i]*gyzx[i]) +
|
||||
gupxz[i] * (gupxz[i]*gxzx[i] + gupyz[i]*gyzx[i] + gupzz[i]*gzzx[i]) +
|
||||
|
||||
gupxy[i] * (gupxz[i]*gxxy[i] + gupyz[i]*gxyy[i] + gupzz[i]*gxzy[i]) +
|
||||
gupyy[i] * (gupxz[i]*gxyy[i] + gupyz[i]*gyyy[i] + gupzz[i]*gyzy[i]) +
|
||||
gupyz[i] * (gupxz[i]*gxzy[i] + gupyz[i]*gyzy[i] + gupzz[i]*gzzy[i]) +
|
||||
|
||||
gupxz[i] * (gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i] + gupzz[i]*gxzz[i]) +
|
||||
gupyz[i] * (gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i] + gupzz[i]*gyzz[i]) +
|
||||
gupzz[i] * (gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i] + gupzz[i]*gzzz[i])
|
||||
lg_xx * (lg_xz*gxxx[i] + lg_yz*gxyx[i] + lg_zz*gxzx[i]) +
|
||||
lg_xy * (lg_xz*gxyx[i] + lg_yz*gyyx[i] + lg_zz*gyzx[i]) +
|
||||
lg_xz * (lg_xz*gxzx[i] + lg_yz*gyzx[i] + lg_zz*gzzx[i]) +
|
||||
lg_xy * (lg_xz*gxxy[i] + lg_yz*gxyy[i] + lg_zz*gxzy[i]) +
|
||||
lg_yy * (lg_xz*gxyy[i] + lg_yz*gyyy[i] + lg_zz*gyzy[i]) +
|
||||
lg_yz * (lg_xz*gxzy[i] + lg_yz*gyzy[i] + lg_zz*gzzy[i]) +
|
||||
lg_xz * (lg_xz*gxxz[i] + lg_yz*gxyz[i] + lg_zz*gxzz[i]) +
|
||||
lg_yz * (lg_xz*gxyz[i] + lg_yz*gyyz[i] + lg_zz*gyzz[i]) +
|
||||
lg_zz * (lg_xz*gxzz[i] + lg_yz*gyzz[i] + lg_zz*gzzz[i])
|
||||
);
|
||||
}
|
||||
|
||||
Gamxxx[i] = HALF * ( lg_xx*gxxx[i]
|
||||
+ lg_xy*(TWO*gxyx[i] - gxxy[i])
|
||||
+ lg_xz*(TWO*gxzx[i] - gxxz[i]) );
|
||||
Gamyxx[i] = HALF * ( lg_xy*gxxx[i]
|
||||
+ lg_yy*(TWO*gxyx[i] - gxxy[i])
|
||||
+ lg_yz*(TWO*gxzx[i] - gxxz[i]) );
|
||||
Gamzxx[i] = HALF * ( lg_xz*gxxx[i]
|
||||
+ lg_yz*(TWO*gxyx[i] - gxxy[i])
|
||||
+ lg_zz*(TWO*gxzx[i] - gxxz[i]) );
|
||||
Gamxyy[i] = HALF * ( lg_xx*(TWO*gxyy[i] - gyyx[i])
|
||||
+ lg_xy*gyyy[i]
|
||||
+ lg_xz*(TWO*gyzy[i] - gyyz[i]) );
|
||||
Gamyyy[i] = HALF * ( lg_xy*(TWO*gxyy[i] - gyyx[i])
|
||||
+ lg_yy*gyyy[i]
|
||||
+ lg_yz*(TWO*gyzy[i] - gyyz[i]) );
|
||||
Gamzyy[i] = HALF * ( lg_xz*(TWO*gxyy[i] - gyyx[i])
|
||||
+ lg_yz*gyyy[i]
|
||||
+ lg_zz*(TWO*gyzy[i] - gyyz[i]) );
|
||||
Gamxzz[i] = HALF * ( lg_xx*(TWO*gxzz[i] - gzzx[i])
|
||||
+ lg_xy*(TWO*gyzz[i] - gzzy[i])
|
||||
+ lg_xz*gzzz[i] );
|
||||
Gamyzz[i] = HALF * ( lg_xy*(TWO*gxzz[i] - gzzx[i])
|
||||
+ lg_yy*(TWO*gyzz[i] - gzzy[i])
|
||||
+ lg_yz*gzzz[i] );
|
||||
Gamzzz[i] = HALF * ( lg_xz*(TWO*gxzz[i] - gzzx[i])
|
||||
+ lg_yz*(TWO*gyzz[i] - gzzy[i])
|
||||
+ lg_zz*gzzz[i] );
|
||||
Gamxxy[i] = HALF * ( lg_xx*gxxy[i]
|
||||
+ lg_xy*gyyx[i]
|
||||
+ lg_xz*(gxzy[i] + gyzx[i] - gxyz[i]) );
|
||||
Gamyxy[i] = HALF * ( lg_xy*gxxy[i]
|
||||
+ lg_yy*gyyx[i]
|
||||
+ lg_yz*(gxzy[i] + gyzx[i] - gxyz[i]) );
|
||||
Gamzxy[i] = HALF * ( lg_xz*gxxy[i]
|
||||
+ lg_yz*gyyx[i]
|
||||
+ lg_zz*(gxzy[i] + gyzx[i] - gxyz[i]) );
|
||||
Gamxxz[i] = HALF * ( lg_xx*gxxz[i]
|
||||
+ lg_xy*(gxyz[i] + gyzx[i] - gxzy[i])
|
||||
+ lg_xz*gzzx[i] );
|
||||
Gamyxz[i] = HALF * ( lg_xy*gxxz[i]
|
||||
+ lg_yy*(gxyz[i] + gyzx[i] - gxzy[i])
|
||||
+ lg_yz*gzzx[i] );
|
||||
Gamzxz[i] = HALF * ( lg_xz*gxxz[i]
|
||||
+ lg_yz*(gxyz[i] + gyzx[i] - gxzy[i])
|
||||
+ lg_zz*gzzx[i] );
|
||||
Gamxyz[i] = HALF * ( lg_xx*(gxyz[i] + gxzy[i] - gyzx[i])
|
||||
+ lg_xy*gyyz[i]
|
||||
+ lg_xz*gzzy[i] );
|
||||
Gamyyz[i] = HALF * ( lg_xy*(gxyz[i] + gxzy[i] - gyzx[i])
|
||||
+ lg_yy*gyyz[i]
|
||||
+ lg_yz*gzzy[i] );
|
||||
Gamzyz[i] = HALF * ( lg_xz*(gxyz[i] + gxzy[i] - gyzx[i])
|
||||
+ lg_yz*gyyz[i]
|
||||
+ lg_zz*gzzy[i] );
|
||||
}
|
||||
// 5ms //
|
||||
// Fused: A^{ij} raise-index + Gamma_rhs part 1 (2 loops -> 1)
|
||||
for (int i=0;i<all;i+=1) {
|
||||
|
||||
Gamxxx[i] = HALF * ( gupxx[i]*gxxx[i]
|
||||
+ gupxy[i]*(TWO*gxyx[i] - gxxy[i])
|
||||
+ gupxz[i]*(TWO*gxzx[i] - gxxz[i]) );
|
||||
|
||||
Gamyxx[i] = HALF * ( gupxy[i]*gxxx[i]
|
||||
+ gupyy[i]*(TWO*gxyx[i] - gxxy[i])
|
||||
+ gupyz[i]*(TWO*gxzx[i] - gxxz[i]) );
|
||||
|
||||
Gamzxx[i] = HALF * ( gupxz[i]*gxxx[i]
|
||||
+ gupyz[i]*(TWO*gxyx[i] - gxxy[i])
|
||||
+ gupzz[i]*(TWO*gxzx[i] - gxxz[i]) );
|
||||
|
||||
Gamxyy[i] = HALF * ( gupxx[i]*(TWO*gxyy[i] - gyyx[i])
|
||||
+ gupxy[i]*gyyy[i]
|
||||
+ gupxz[i]*(TWO*gyzy[i] - gyyz[i]) );
|
||||
|
||||
Gamyyy[i] = HALF * ( gupxy[i]*(TWO*gxyy[i] - gyyx[i])
|
||||
+ gupyy[i]*gyyy[i]
|
||||
+ gupyz[i]*(TWO*gyzy[i] - gyyz[i]) );
|
||||
|
||||
Gamzyy[i] = HALF * ( gupxz[i]*(TWO*gxyy[i] - gyyx[i])
|
||||
+ gupyz[i]*gyyy[i]
|
||||
+ gupzz[i]*(TWO*gyzy[i] - gyyz[i]) );
|
||||
|
||||
Gamxzz[i] = HALF * ( gupxx[i]*(TWO*gxzz[i] - gzzx[i])
|
||||
+ gupxy[i]*(TWO*gyzz[i] - gzzy[i])
|
||||
+ gupxz[i]*gzzz[i] );
|
||||
|
||||
Gamyzz[i] = HALF * ( gupxy[i]*(TWO*gxzz[i] - gzzx[i])
|
||||
+ gupyy[i]*(TWO*gyzz[i] - gzzy[i])
|
||||
+ gupyz[i]*gzzz[i] );
|
||||
|
||||
Gamzzz[i] = HALF * ( gupxz[i]*(TWO*gxzz[i] - gzzx[i])
|
||||
+ gupyz[i]*(TWO*gyzz[i] - gzzy[i])
|
||||
+ gupzz[i]*gzzz[i] );
|
||||
|
||||
Gamxxy[i] = HALF * ( gupxx[i]*gxxy[i]
|
||||
+ gupxy[i]*gyyx[i]
|
||||
+ gupxz[i]*(gxzy[i] + gyzx[i] - gxyz[i]) );
|
||||
|
||||
Gamyxy[i] = HALF * ( gupxy[i]*gxxy[i]
|
||||
+ gupyy[i]*gyyx[i]
|
||||
+ gupyz[i]*(gxzy[i] + gyzx[i] - gxyz[i]) );
|
||||
|
||||
Gamzxy[i] = HALF * ( gupxz[i]*gxxy[i]
|
||||
+ gupyz[i]*gyyx[i]
|
||||
+ gupzz[i]*(gxzy[i] + gyzx[i] - gxyz[i]) );
|
||||
|
||||
Gamxxz[i] = HALF * ( gupxx[i]*gxxz[i]
|
||||
+ gupxy[i]*(gxyz[i] + gyzx[i] - gxzy[i])
|
||||
+ gupxz[i]*gzzx[i] );
|
||||
|
||||
Gamyxz[i] = HALF * ( gupxy[i]*gxxz[i]
|
||||
+ gupyy[i]*(gxyz[i] + gyzx[i] - gxzy[i])
|
||||
+ gupyz[i]*gzzx[i] );
|
||||
|
||||
Gamzxz[i] = HALF * ( gupxz[i]*gxxz[i]
|
||||
+ gupyz[i]*(gxyz[i] + gyzx[i] - gxzy[i])
|
||||
+ gupzz[i]*gzzx[i] );
|
||||
|
||||
Gamxyz[i] = HALF * ( gupxx[i]*(gxyz[i] + gxzy[i] - gyzx[i])
|
||||
+ gupxy[i]*gyyz[i]
|
||||
+ gupxz[i]*gzzy[i] );
|
||||
|
||||
Gamyyz[i] = HALF * ( gupxy[i]*(gxyz[i] + gxzy[i] - gyzx[i])
|
||||
+ gupyy[i]*gyyz[i]
|
||||
+ gupyz[i]*gzzy[i] );
|
||||
|
||||
Gamzyz[i] = HALF * ( gupxz[i]*(gxyz[i] + gxzy[i] - gyzx[i])
|
||||
+ gupyz[i]*gyyz[i]
|
||||
+ gupzz[i]*gzzy[i] );
|
||||
|
||||
}
|
||||
// 1.8ms //
|
||||
for (int i=0;i<all;i+=1) {
|
||||
|
||||
Rxx[i] = gupxx[i]*gupxx[i]*Axx[i]
|
||||
double axx = gupxx[i]*gupxx[i]*Axx[i]
|
||||
+ gupxy[i]*gupxy[i]*Ayy[i]
|
||||
+ gupxz[i]*gupxz[i]*Azz[i]
|
||||
+ TWO * ( gupxx[i]*gupxy[i]*Axy[i]
|
||||
+ gupxx[i]*gupxz[i]*Axz[i]
|
||||
+ gupxy[i]*gupxz[i]*Ayz[i] );
|
||||
|
||||
Ryy[i] = gupxy[i]*gupxy[i]*Axx[i]
|
||||
double ayy = gupxy[i]*gupxy[i]*Axx[i]
|
||||
+ gupyy[i]*gupyy[i]*Ayy[i]
|
||||
+ gupyz[i]*gupyz[i]*Azz[i]
|
||||
+ TWO * ( gupxy[i]*gupyy[i]*Axy[i]
|
||||
+ gupxy[i]*gupyz[i]*Axz[i]
|
||||
+ gupyy[i]*gupyz[i]*Ayz[i] );
|
||||
|
||||
Rzz[i] = gupxz[i]*gupxz[i]*Axx[i]
|
||||
double azz = gupxz[i]*gupxz[i]*Axx[i]
|
||||
+ gupyz[i]*gupyz[i]*Ayy[i]
|
||||
+ gupzz[i]*gupzz[i]*Azz[i]
|
||||
+ TWO * ( gupxz[i]*gupyz[i]*Axy[i]
|
||||
+ gupxz[i]*gupzz[i]*Axz[i]
|
||||
+ gupyz[i]*gupzz[i]*Ayz[i] );
|
||||
|
||||
Rxy[i] = gupxx[i]*gupxy[i]*Axx[i]
|
||||
double axy = gupxx[i]*gupxy[i]*Axx[i]
|
||||
+ gupxy[i]*gupyy[i]*Ayy[i]
|
||||
+ gupxz[i]*gupyz[i]*Azz[i]
|
||||
+ ( gupxx[i]*gupyy[i] + gupxy[i]*gupxy[i] ) * Axy[i]
|
||||
+ ( gupxx[i]*gupyz[i] + gupxz[i]*gupxy[i] ) * Axz[i]
|
||||
+ ( gupxy[i]*gupyz[i] + gupxz[i]*gupyy[i] ) * Ayz[i];
|
||||
|
||||
Rxz[i] = gupxx[i]*gupxz[i]*Axx[i]
|
||||
double axz = gupxx[i]*gupxz[i]*Axx[i]
|
||||
+ gupxy[i]*gupyz[i]*Ayy[i]
|
||||
+ gupxz[i]*gupzz[i]*Azz[i]
|
||||
+ ( gupxx[i]*gupyz[i] + gupxy[i]*gupxz[i] ) * Axy[i]
|
||||
+ ( gupxx[i]*gupzz[i] + gupxz[i]*gupxz[i] ) * Axz[i]
|
||||
+ ( gupxy[i]*gupzz[i] + gupxz[i]*gupyz[i] ) * Ayz[i];
|
||||
|
||||
Ryz[i] = gupxy[i]*gupxz[i]*Axx[i]
|
||||
double ayz = gupxy[i]*gupxz[i]*Axx[i]
|
||||
+ gupyy[i]*gupyz[i]*Ayy[i]
|
||||
+ gupyz[i]*gupzz[i]*Azz[i]
|
||||
+ ( gupxy[i]*gupyz[i] + gupyy[i]*gupxz[i] ) * Axy[i]
|
||||
+ ( gupxy[i]*gupzz[i] + gupyz[i]*gupxz[i] ) * Axz[i]
|
||||
+ ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i];
|
||||
}
|
||||
// 4ms //
|
||||
for(int i=0;i<all;i+=1){
|
||||
Gamx_rhs[i] = - TWO * ( Lapx[i] * Rxx[i] + Lapy[i] * Rxy[i] + Lapz[i] * Rxz[i] ) +
|
||||
TWO * alpn1[i] * (
|
||||
-F3o2/chin1[i] * ( chix[i] * Rxx[i] + chiy[i] * Rxy[i] + chiz[i] * Rxz[i] ) -
|
||||
gupxx[i] * ( F2o3 * Kx[i] + EIGHT * PI * Sx[i] ) -
|
||||
gupxy[i] * ( F2o3 * Ky[i] + EIGHT * PI * Sy[i] ) -
|
||||
gupxz[i] * ( F2o3 * Kz[i] + EIGHT * PI * Sz[i] ) +
|
||||
Gamxxx[i] * Rxx[i] + Gamxyy[i] * Ryy[i] + Gamxzz[i] * Rzz[i] +
|
||||
TWO * ( Gamxxy[i] * Rxy[i] + Gamxxz[i] * Rxz[i] + Gamxyz[i] * Ryz[i] ) );
|
||||
Rxx[i] = axx; Ryy[i] = ayy; Rzz[i] = azz;
|
||||
Rxy[i] = axy; Rxz[i] = axz; Ryz[i] = ayz;
|
||||
|
||||
Gamy_rhs[i] = -TWO * ( Lapx[i]*Rxy[i] + Lapy[i]*Ryy[i] + Lapz[i]*Ryz[i] )
|
||||
Gamx_rhs[i] = - TWO * ( Lapx[i]*axx + Lapy[i]*axy + Lapz[i]*axz ) +
|
||||
TWO * alpn1[i] * (
|
||||
-F3o2/chin1[i] * ( chix[i]*axx + chiy[i]*axy + chiz[i]*axz ) -
|
||||
gupxx[i] * ( F2o3*Kx[i] + EIGHT*PI*Sx[i] ) -
|
||||
gupxy[i] * ( F2o3*Ky[i] + EIGHT*PI*Sy[i] ) -
|
||||
gupxz[i] * ( F2o3*Kz[i] + EIGHT*PI*Sz[i] ) +
|
||||
Gamxxx[i]*axx + Gamxyy[i]*ayy + Gamxzz[i]*azz +
|
||||
TWO * ( Gamxxy[i]*axy + Gamxxz[i]*axz + Gamxyz[i]*ayz ) );
|
||||
|
||||
Gamy_rhs[i] = -TWO * ( Lapx[i]*axy + Lapy[i]*ayy + Lapz[i]*ayz )
|
||||
+ TWO * alpn1[i] * (
|
||||
-F3o2/chin1[i] * ( chix[i]*Rxy[i] + chiy[i]*Ryy[i] + chiz[i]*Ryz[i] )
|
||||
-F3o2/chin1[i] * ( chix[i]*axy + chiy[i]*ayy + chiz[i]*ayz )
|
||||
- gupxy[i] * ( F2o3*Kx[i] + EIGHT*PI*Sx[i] )
|
||||
- gupyy[i] * ( F2o3*Ky[i] + EIGHT*PI*Sy[i] )
|
||||
- gupyz[i] * ( F2o3*Kz[i] + EIGHT*PI*Sz[i] )
|
||||
+ Gamyxx[i]*Rxx[i] + Gamyyy[i]*Ryy[i] + Gamyzz[i]*Rzz[i]
|
||||
+ TWO * ( Gamyxy[i]*Rxy[i] + Gamyxz[i]*Rxz[i] + Gamyyz[i]*Ryz[i] )
|
||||
+ Gamyxx[i]*axx + Gamyyy[i]*ayy + Gamyzz[i]*azz
|
||||
+ TWO * ( Gamyxy[i]*axy + Gamyxz[i]*axz + Gamyyz[i]*ayz )
|
||||
);
|
||||
|
||||
Gamz_rhs[i] = -TWO * ( Lapx[i]*Rxz[i] + Lapy[i]*Ryz[i] + Lapz[i]*Rzz[i] )
|
||||
Gamz_rhs[i] = -TWO * ( Lapx[i]*axz + Lapy[i]*ayz + Lapz[i]*azz )
|
||||
+ TWO * alpn1[i] * (
|
||||
-F3o2/chin1[i] * ( chix[i]*Rxz[i] + chiy[i]*Ryz[i] + chiz[i]*Rzz[i] )
|
||||
-F3o2/chin1[i] * ( chix[i]*axz + chiy[i]*ayz + chiz[i]*azz )
|
||||
- gupxz[i] * ( F2o3*Kx[i] + EIGHT*PI*Sx[i] )
|
||||
- gupyz[i] * ( F2o3*Ky[i] + EIGHT*PI*Sy[i] )
|
||||
- gupzz[i] * ( F2o3*Kz[i] + EIGHT*PI*Sz[i] )
|
||||
+ Gamzxx[i]*Rxx[i] + Gamzyy[i]*Ryy[i] + Gamzzz[i]*Rzz[i]
|
||||
+ TWO * ( Gamzxy[i]*Rxy[i] + Gamzxz[i]*Rxz[i] + Gamzyz[i]*Ryz[i] )
|
||||
+ Gamzxx[i]*axx + Gamzyy[i]*ayy + Gamzzz[i]*azz
|
||||
+ TWO * ( Gamzxy[i]*axy + Gamzxz[i]*axz + Gamzyz[i]*ayz )
|
||||
);
|
||||
}
|
||||
// 22.3ms //
|
||||
@@ -365,65 +326,63 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev);
|
||||
fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev);
|
||||
|
||||
// 3.5ms //
|
||||
// Fused: fxx/Gamxa + Gamma_rhs part 2 (2 loops -> 1)
|
||||
for(int i=0;i<all;i+=1){
|
||||
fxx[i] = gxxx[i] + gxyy[i] + gxzz[i];
|
||||
fxy[i] = gxyx[i] + gyyy[i] + gyzz[i];
|
||||
fxz[i] = gxzx[i] + gyzy[i] + gzzz[i];
|
||||
Gamxa[i] = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
|
||||
const double divb = betaxx[i] + betayy[i] + betazz[i];
|
||||
double lfxx = gxxx[i] + gxyy[i] + gxzz[i];
|
||||
double lfxy = gxyx[i] + gyyy[i] + gyzz[i];
|
||||
double lfxz = gxzx[i] + gyzy[i] + gzzz[i];
|
||||
fxx[i] = lfxx; fxy[i] = lfxy; fxz[i] = lfxz;
|
||||
|
||||
double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
|
||||
+ TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] );
|
||||
|
||||
Gamya[i] = gupxx[i]*Gamyxx[i] + gupyy[i]*Gamyyy[i] + gupzz[i]*Gamyzz[i]
|
||||
double gya = gupxx[i]*Gamyxx[i] + gupyy[i]*Gamyyy[i] + gupzz[i]*Gamyzz[i]
|
||||
+ TWO * ( gupxy[i]*Gamyxy[i] + gupxz[i]*Gamyxz[i] + gupyz[i]*Gamyyz[i] );
|
||||
|
||||
Gamza[i] = gupxx[i]*Gamzxx[i] + gupyy[i]*Gamzyy[i] + gupzz[i]*Gamzzz[i]
|
||||
double gza = gupxx[i]*Gamzxx[i] + gupyy[i]*Gamzyy[i] + gupzz[i]*Gamzzz[i]
|
||||
+ TWO * ( gupxy[i]*Gamzxy[i] + gupxz[i]*Gamzxz[i] + gupyz[i]*Gamzyz[i] );
|
||||
}
|
||||
// 3.9ms //
|
||||
for(int i=0;i<all;i+=1){
|
||||
Gamx_rhs[i] = Gamx_rhs[i]
|
||||
+ F2o3 * Gamxa[i] * div_beta[i]
|
||||
- Gamxa[i] * betaxx[i] - Gamya[i] * betaxy[i] - Gamza[i] * betaxz[i]
|
||||
+ F1o3 * ( gupxx[i] * fxx[i] + gupxy[i] * fxy[i] + gupxz[i] * fxz[i] )
|
||||
+ F2o3 * gxa * divb
|
||||
- gxa * betaxx[i] - gya * betaxy[i] - gza * betaxz[i]
|
||||
+ F1o3 * ( gupxx[i] * lfxx + gupxy[i] * lfxy + gupxz[i] * lfxz )
|
||||
+ gupxx[i] * gxxx[i] + gupyy[i] * gyyx[i] + gupzz[i] * gzzx[i]
|
||||
+ TWO * ( gupxy[i] * gxyx[i] + gupxz[i] * gxzx[i] + gupyz[i] * gyzx[i] );
|
||||
|
||||
Gamy_rhs[i] = Gamy_rhs[i]
|
||||
+ F2o3 * Gamya[i] * div_beta[i]
|
||||
- Gamxa[i] * betayx[i] - Gamya[i] * betayy[i] - Gamza[i] * betayz[i]
|
||||
+ F1o3 * ( gupxy[i] * fxx[i] + gupyy[i] * fxy[i] + gupyz[i] * fxz[i] )
|
||||
+ F2o3 * gya * divb
|
||||
- gxa * betayx[i] - gya * betayy[i] - gza * betayz[i]
|
||||
+ F1o3 * ( gupxy[i] * lfxx + gupyy[i] * lfxy + gupyz[i] * lfxz )
|
||||
+ gupxx[i] * gxxy[i] + gupyy[i] * gyyy[i] + gupzz[i] * gzzy[i]
|
||||
+ TWO * ( gupxy[i] * gxyy[i] + gupxz[i] * gxzy[i] + gupyz[i] * gyzy[i] );
|
||||
|
||||
Gamz_rhs[i] = Gamz_rhs[i]
|
||||
+ F2o3 * Gamza[i] * div_beta[i]
|
||||
- Gamxa[i] * betazx[i] - Gamya[i] * betazy[i] - Gamza[i] * betazz[i]
|
||||
+ F1o3 * ( gupxz[i] * fxx[i] + gupyz[i] * fxy[i] + gupzz[i] * fxz[i] )
|
||||
+ F2o3 * gza * divb
|
||||
- gxa * betazx[i] - gya * betazy[i] - gza * betazz[i]
|
||||
+ F1o3 * ( gupxz[i] * lfxx + gupyz[i] * lfxy + gupzz[i] * lfxz )
|
||||
+ gupxx[i] * gxxz[i] + gupyy[i] * gyyz[i] + gupzz[i] * gzzz[i]
|
||||
+ TWO * ( gupxy[i] * gxyz[i] + gupxz[i] * gxzz[i] + gupyz[i] * gyzz[i] );
|
||||
}
|
||||
// 4.4ms //
|
||||
for (int i=0;i<all;i+=1) {
|
||||
gxxx[i] = gxx[i]*Gamxxx[i] + gxy[i]*Gamyxx[i] + gxz[i]*Gamzxx[i];
|
||||
gxyx[i] = gxx[i]*Gamxxy[i] + gxy[i]*Gamyxy[i] + gxz[i]*Gamzxy[i];
|
||||
gxzx[i] = gxx[i]*Gamxxz[i] + gxy[i]*Gamyxz[i] + gxz[i]*Gamzxz[i];
|
||||
gyyx[i] = gxx[i]*Gamxyy[i] + gxy[i]*Gamyyy[i] + gxz[i]*Gamzyy[i];
|
||||
gyzx[i] = gxx[i]*Gamxyz[i] + gxy[i]*Gamyyz[i] + gxz[i]*Gamzyz[i];
|
||||
gzzx[i] = gxx[i]*Gamxzz[i] + gxy[i]*Gamyzz[i] + gxz[i]*Gamzzz[i];
|
||||
gxxx[i] = (dxx[i] + ONE)*Gamxxx[i] + gxy[i]*Gamyxx[i] + gxz[i]*Gamzxx[i];
|
||||
gxyx[i] = (dxx[i] + ONE)*Gamxxy[i] + gxy[i]*Gamyxy[i] + gxz[i]*Gamzxy[i];
|
||||
gxzx[i] = (dxx[i] + ONE)*Gamxxz[i] + gxy[i]*Gamyxz[i] + gxz[i]*Gamzxz[i];
|
||||
gyyx[i] = (dxx[i] + ONE)*Gamxyy[i] + gxy[i]*Gamyyy[i] + gxz[i]*Gamzyy[i];
|
||||
gyzx[i] = (dxx[i] + ONE)*Gamxyz[i] + gxy[i]*Gamyyz[i] + gxz[i]*Gamzyz[i];
|
||||
gzzx[i] = (dxx[i] + ONE)*Gamxzz[i] + gxy[i]*Gamyzz[i] + gxz[i]*Gamzzz[i];
|
||||
|
||||
gxxy[i] = gxy[i]*Gamxxx[i] + gyy[i]*Gamyxx[i] + gyz[i]*Gamzxx[i];
|
||||
gxyy[i] = gxy[i]*Gamxxy[i] + gyy[i]*Gamyxy[i] + gyz[i]*Gamzxy[i];
|
||||
gxzy[i] = gxy[i]*Gamxxz[i] + gyy[i]*Gamyxz[i] + gyz[i]*Gamzxz[i];
|
||||
gyyy[i] = gxy[i]*Gamxyy[i] + gyy[i]*Gamyyy[i] + gyz[i]*Gamzyy[i];
|
||||
gyzy[i] = gxy[i]*Gamxyz[i] + gyy[i]*Gamyyz[i] + gyz[i]*Gamzyz[i];
|
||||
gzzy[i] = gxy[i]*Gamxzz[i] + gyy[i]*Gamyzz[i] + gyz[i]*Gamzzz[i];
|
||||
gxxy[i] = gxy[i]*Gamxxx[i] + (dyy[i] + ONE)*Gamyxx[i] + gyz[i]*Gamzxx[i];
|
||||
gxyy[i] = gxy[i]*Gamxxy[i] + (dyy[i] + ONE)*Gamyxy[i] + gyz[i]*Gamzxy[i];
|
||||
gxzy[i] = gxy[i]*Gamxxz[i] + (dyy[i] + ONE)*Gamyxz[i] + gyz[i]*Gamzxz[i];
|
||||
gyyy[i] = gxy[i]*Gamxyy[i] + (dyy[i] + ONE)*Gamyyy[i] + gyz[i]*Gamzyy[i];
|
||||
gyzy[i] = gxy[i]*Gamxyz[i] + (dyy[i] + ONE)*Gamyyz[i] + gyz[i]*Gamzyz[i];
|
||||
gzzy[i] = gxy[i]*Gamxzz[i] + (dyy[i] + ONE)*Gamyzz[i] + gyz[i]*Gamzzz[i];
|
||||
|
||||
gxxz[i] = gxz[i]*Gamxxx[i] + gyz[i]*Gamyxx[i] + gzz[i]*Gamzxx[i];
|
||||
gxyz[i] = gxz[i]*Gamxxy[i] + gyz[i]*Gamyxy[i] + gzz[i]*Gamzxy[i];
|
||||
gxzz[i] = gxz[i]*Gamxxz[i] + gyz[i]*Gamyxz[i] + gzz[i]*Gamzxz[i];
|
||||
gyyz[i] = gxz[i]*Gamxyy[i] + gyz[i]*Gamyyy[i] + gzz[i]*Gamzyy[i];
|
||||
gyzz[i] = gxz[i]*Gamxyz[i] + gyz[i]*Gamyyz[i] + gzz[i]*Gamzyz[i];
|
||||
gzzz[i] = gxz[i]*Gamxzz[i] + gyz[i]*Gamyzz[i] + gzz[i]*Gamzzz[i];
|
||||
gxxz[i] = gxz[i]*Gamxxx[i] + gyz[i]*Gamyxx[i] + (dzz[i] + ONE)*Gamzxx[i];
|
||||
gxyz[i] = gxz[i]*Gamxxy[i] + gyz[i]*Gamyxy[i] + (dzz[i] + ONE)*Gamzxy[i];
|
||||
gxzz[i] = gxz[i]*Gamxxz[i] + gyz[i]*Gamyxz[i] + (dzz[i] + ONE)*Gamzxz[i];
|
||||
gyyz[i] = gxz[i]*Gamxyy[i] + gyz[i]*Gamyyy[i] + (dzz[i] + ONE)*Gamzyy[i];
|
||||
gyzz[i] = gxz[i]*Gamxyz[i] + gyz[i]*Gamyyz[i] + (dzz[i] + ONE)*Gamzyz[i];
|
||||
gzzz[i] = gxz[i]*Gamxzz[i] + gyz[i]*Gamyzz[i] + (dzz[i] + ONE)*Gamzzz[i];
|
||||
}
|
||||
// 22.2ms //
|
||||
fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
|
||||
@@ -471,10 +430,17 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
// 14ms //
|
||||
/* 假设 all = ex1*ex2*ex3,所有量都是 length=all 的 double 数组(已按同一扁平化规则排布) */
|
||||
for (int i = 0; i < all; i += 1) {
|
||||
const double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
|
||||
+ TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] );
|
||||
const double gya = gupxx[i]*Gamyxx[i] + gupyy[i]*Gamyyy[i] + gupzz[i]*Gamyzz[i]
|
||||
+ TWO * ( gupxy[i]*Gamyxy[i] + gupxz[i]*Gamyxz[i] + gupyz[i]*Gamyyz[i] );
|
||||
const double gza = gupxx[i]*Gamzxx[i] + gupyy[i]*Gamzyy[i] + gupzz[i]*Gamzzz[i]
|
||||
+ TWO * ( gupxy[i]*Gamzxy[i] + gupxz[i]*Gamzxz[i] + gupyz[i]*Gamzyz[i] );
|
||||
|
||||
Rxx[i] =
|
||||
-HALF * Rxx[i]
|
||||
+ gxx[i] * Gamxx[i] + gxy[i] * Gamyx[i] + gxz[i] * Gamzx[i]
|
||||
+ Gamxa[i] * gxxx[i] + Gamya[i] * gxyx[i] + Gamza[i] * gxzx[i]
|
||||
+ (dxx[i] + ONE) * Gamxx[i] + gxy[i] * Gamyx[i] + gxz[i] * Gamzx[i]
|
||||
+ gxa * gxxx[i] + gya * gxyx[i] + gza * gxzx[i]
|
||||
+ gupxx[i] * (
|
||||
TWO * (Gamxxx[i] * gxxx[i] + Gamyxx[i] * gxyx[i] + Gamzxx[i] * gxzx[i]) +
|
||||
(Gamxxx[i] * gxxx[i] + Gamyxx[i] * gxxy[i] + Gamzxx[i] * gxxz[i])
|
||||
@@ -508,8 +474,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
|
||||
Ryy[i] =
|
||||
-HALF * Ryy[i]
|
||||
+ gxy[i] * Gamxy[i] + gyy[i] * Gamyy[i] + gyz[i] * Gamzy[i]
|
||||
+ Gamxa[i] * gxyy[i] + Gamya[i] * gyyy[i] + Gamza[i] * gyzy[i]
|
||||
+ gxy[i] * Gamxy[i] + (dyy[i] + ONE) * Gamyy[i] + gyz[i] * Gamzy[i]
|
||||
+ gxa * gxyy[i] + gya * gyyy[i] + gza * gyzy[i]
|
||||
+ gupxx[i] * (
|
||||
TWO * (Gamxxy[i] * gxxy[i] + Gamyxy[i] * gxyy[i] + Gamzxy[i] * gxzy[i]) +
|
||||
(Gamxxy[i] * gxyx[i] + Gamyxy[i] * gxyy[i] + Gamzxy[i] * gxyz[i])
|
||||
@@ -543,8 +509,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
|
||||
Rzz[i] =
|
||||
-HALF * Rzz[i]
|
||||
+ gxz[i] * Gamxz[i] + gyz[i] * Gamyz[i] + gzz[i] * Gamzz[i]
|
||||
+ Gamxa[i] * gxzz[i] + Gamya[i] * gyzz[i] + Gamza[i] * gzzz[i]
|
||||
+ gxz[i] * Gamxz[i] + gyz[i] * Gamyz[i] + (dzz[i] + ONE) * Gamzz[i]
|
||||
+ gxa * gxzz[i] + gya * gyzz[i] + gza * gzzz[i]
|
||||
+ gupxx[i] * (
|
||||
TWO * (Gamxxz[i] * gxxz[i] + Gamyxz[i] * gxyz[i] + Gamzxz[i] * gxzz[i]) +
|
||||
(Gamxxz[i] * gxzx[i] + Gamyxz[i] * gxzy[i] + Gamzxz[i] * gxzz[i])
|
||||
@@ -579,10 +545,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
Rxy[i] =
|
||||
HALF * (
|
||||
-Rxy[i]
|
||||
+ gxx[i] * Gamxy[i] + gxy[i] * Gamyy[i] + gxz[i] * Gamzy[i]
|
||||
+ gxy[i] * Gamxx[i] + gyy[i] * Gamyx[i] + gyz[i] * Gamzx[i]
|
||||
+ Gamxa[i] * gxyx[i] + Gamya[i] * gyyx[i] + Gamza[i] * gyzx[i]
|
||||
+ Gamxa[i] * gxxy[i] + Gamya[i] * gxyy[i] + Gamza[i] * gxzy[i]
|
||||
+ (dxx[i] + ONE) * Gamxy[i] + gxy[i] * Gamyy[i] + gxz[i] * Gamzy[i]
|
||||
+ gxy[i] * Gamxx[i] + (dyy[i] + ONE) * Gamyx[i] + gyz[i] * Gamzx[i]
|
||||
+ gxa * gxyx[i] + gya * gyyx[i] + gza * gyzx[i]
|
||||
+ gxa * gxxy[i] + gya * gxyy[i] + gza * gxzy[i]
|
||||
)
|
||||
+ gupxx[i] * (
|
||||
Gamxxx[i] * gxxy[i] + Gamyxx[i] * gxyy[i] + Gamzxx[i] * gxzy[i]
|
||||
@@ -627,10 +593,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
Rxz[i] =
|
||||
HALF * (
|
||||
-Rxz[i]
|
||||
+ gxx[i] * Gamxz[i] + gxy[i] * Gamyz[i] + gxz[i] * Gamzz[i]
|
||||
+ gxz[i] * Gamxx[i] + gyz[i] * Gamyx[i] + gzz[i] * Gamzx[i]
|
||||
+ Gamxa[i] * gxzx[i] + Gamya[i] * gyzx[i] + Gamza[i] * gzzx[i]
|
||||
+ Gamxa[i] * gxxz[i] + Gamya[i] * gxyz[i] + Gamza[i] * gxzz[i]
|
||||
+ (dxx[i] + ONE) * Gamxz[i] + gxy[i] * Gamyz[i] + gxz[i] * Gamzz[i]
|
||||
+ gxz[i] * Gamxx[i] + gyz[i] * Gamyx[i] + (dzz[i] + ONE) * Gamzx[i]
|
||||
+ gxa * gxzx[i] + gya * gyzx[i] + gza * gzzx[i]
|
||||
+ gxa * gxxz[i] + gya * gxyz[i] + gza * gxzz[i]
|
||||
)
|
||||
+ gupxx[i] * (
|
||||
Gamxxx[i] * gxxz[i] + Gamyxx[i] * gxyz[i] + Gamzxx[i] * gxzz[i]
|
||||
@@ -675,10 +641,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
Ryz[i] =
|
||||
HALF * (
|
||||
-Ryz[i]
|
||||
+ gxy[i] * Gamxz[i] + gyy[i] * Gamyz[i] + gyz[i] * Gamzz[i]
|
||||
+ gxz[i] * Gamxy[i] + gyz[i] * Gamyy[i] + gzz[i] * Gamzy[i]
|
||||
+ Gamxa[i] * gxzy[i] + Gamya[i] * gyzy[i] + Gamza[i] * gzzy[i]
|
||||
+ Gamxa[i] * gxyz[i] + Gamya[i] * gyyz[i] + Gamza[i] * gyzz[i]
|
||||
+ gxy[i] * Gamxz[i] + (dyy[i] + ONE) * Gamyz[i] + gyz[i] * Gamzz[i]
|
||||
+ gxz[i] * Gamxy[i] + gyz[i] * Gamyy[i] + (dzz[i] + ONE) * Gamzy[i]
|
||||
+ gxa * gxzy[i] + gya * gyzy[i] + gza * gzzy[i]
|
||||
+ gxa * gxyz[i] + gya * gyyz[i] + gza * gyzz[i]
|
||||
)
|
||||
+ gupxx[i] * (
|
||||
Gamxxy[i] * gxxz[i] + Gamyxy[i] * gxyz[i] + Gamzxy[i] * gxzz[i]
|
||||
@@ -739,9 +705,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
+ TWO * gupxy[i] * (fxy[i] - (F3o2 / chin1[i]) * chix[i] * chiy[i])
|
||||
+ TWO * gupxz[i] * (fxz[i] - (F3o2 / chin1[i]) * chix[i] * chiz[i])
|
||||
+ TWO * gupyz[i] * (fyz[i] - (F3o2 / chin1[i]) * chiy[i] * chiz[i]);
|
||||
Rxx[i] = Rxx[i] + ( fxx[i] - (chix[i] * chix[i]) / (chin1[i] * TWO) + gxx[i] * f[i] ) / (chin1[i] * TWO);
|
||||
Ryy[i] = Ryy[i] + ( fyy[i] - (chiy[i] * chiy[i]) / (chin1[i] * TWO) + gyy[i] * f[i] ) / (chin1[i] * TWO);
|
||||
Rzz[i] = Rzz[i] + ( fzz[i] - (chiz[i] * chiz[i]) / (chin1[i] * TWO) + gzz[i] * f[i] ) / (chin1[i] * TWO);
|
||||
Rxx[i] = Rxx[i] + ( fxx[i] - (chix[i] * chix[i]) / (chin1[i] * TWO) + (dxx[i] + ONE) * f[i] ) / (chin1[i] * TWO);
|
||||
Ryy[i] = Ryy[i] + ( fyy[i] - (chiy[i] * chiy[i]) / (chin1[i] * TWO) + (dyy[i] + ONE) * f[i] ) / (chin1[i] * TWO);
|
||||
Rzz[i] = Rzz[i] + ( fzz[i] - (chiz[i] * chiz[i]) / (chin1[i] * TWO) + (dzz[i] + ONE) * f[i] ) / (chin1[i] * TWO);
|
||||
|
||||
Rxy[i] = Rxy[i] + ( fxy[i] - (chix[i] * chiy[i]) / (chin1[i] * TWO) + gxy[i] * f[i] ) / (chin1[i] * TWO);
|
||||
Rxz[i] = Rxz[i] + ( fxz[i] - (chix[i] * chiz[i]) / (chin1[i] * TWO) + gxz[i] * f[i] ) / (chin1[i] * TWO);
|
||||
@@ -760,17 +726,17 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
gxxz[i] = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) / chin1[i];
|
||||
|
||||
/* Christoffel 修正项 */
|
||||
Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) / chin1[i]) - gxx[i] * gxxx[i] ) * HALF;
|
||||
Gamyxx[i] = Gamyxx[i] - ( 0.0 - gxx[i] * gxxy[i] ) * HALF; /* 原式只有 -gxx*gxxy */
|
||||
Gamzxx[i] = Gamzxx[i] - ( 0.0 - gxx[i] * gxxz[i] ) * HALF;
|
||||
Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) / chin1[i]) - (dxx[i] + ONE) * gxxx[i] ) * HALF;
|
||||
Gamyxx[i] = Gamyxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxy[i] ) * HALF; /* 原式只有 -gxx*gxxy */
|
||||
Gamzxx[i] = Gamzxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxz[i] ) * HALF;
|
||||
|
||||
Gamxyy[i] = Gamxyy[i] - ( 0.0 - gyy[i] * gxxx[i] ) * HALF;
|
||||
Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) / chin1[i]) - gyy[i] * gxxy[i] ) * HALF;
|
||||
Gamzyy[i] = Gamzyy[i] - ( 0.0 - gyy[i] * gxxz[i] ) * HALF;
|
||||
Gamxyy[i] = Gamxyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxx[i] ) * HALF;
|
||||
Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) / chin1[i]) - (dyy[i] + ONE) * gxxy[i] ) * HALF;
|
||||
Gamzyy[i] = Gamzyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxz[i] ) * HALF;
|
||||
|
||||
Gamxzz[i] = Gamxzz[i] - ( 0.0 - gzz[i] * gxxx[i] ) * HALF;
|
||||
Gamyzz[i] = Gamyzz[i] - ( 0.0 - gzz[i] * gxxy[i] ) * HALF;
|
||||
Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) / chin1[i]) - gzz[i] * gxxz[i] ) * HALF;
|
||||
Gamxzz[i] = Gamxzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxx[i] ) * HALF;
|
||||
Gamyzz[i] = Gamyzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxy[i] ) * HALF;
|
||||
Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) / chin1[i]) - (dzz[i] + ONE) * gxxz[i] ) * HALF;
|
||||
|
||||
Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] / chin1[i]) - gxy[i] * gxxx[i] ) * HALF;
|
||||
Gamyxy[i] = Gamyxy[i] - ( ( chix[i] / chin1[i]) - gxy[i] * gxxy[i] ) * HALF;
|
||||
@@ -792,14 +758,13 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
fxy[i] = fxy[i] - Gamxxy[i] * Lapx[i] - Gamyxy[i] * Lapy[i] - Gamzxy[i] * Lapz[i];
|
||||
fxz[i] = fxz[i] - Gamxxz[i] * Lapx[i] - Gamyxz[i] * Lapy[i] - Gamzxz[i] * Lapz[i];
|
||||
fyz[i] = fyz[i] - Gamxyz[i] * Lapx[i] - Gamyyz[i] * Lapy[i] - Gamzyz[i] * Lapz[i];
|
||||
}
|
||||
// 1ms //
|
||||
for (int i=0;i<all;i+=1) {
|
||||
|
||||
trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
|
||||
+ TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] );
|
||||
}
|
||||
// 2.5ms //
|
||||
for (int i=0;i<all;i+=1) {
|
||||
const double divb = betaxx[i] + betayy[i] + betazz[i];
|
||||
|
||||
S[i] = chin1[i] * (
|
||||
gupxx[i] * Sxx[i] + gupyy[i] * Syy[i] + gupzz[i] * Szz[i]
|
||||
@@ -850,23 +815,20 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
+ (alpn1[i] / chin1[i]) * f[i]
|
||||
);
|
||||
|
||||
fxx[i] = alpn1[i] * (Rxx[i] - EIGHT * PI * Sxx[i]) - fxx[i];
|
||||
fxy[i] = alpn1[i] * (Rxy[i] - EIGHT * PI * Sxy[i]) - fxy[i];
|
||||
fxz[i] = alpn1[i] * (Rxz[i] - EIGHT * PI * Sxz[i]) - fxz[i];
|
||||
fyy[i] = alpn1[i] * (Ryy[i] - EIGHT * PI * Syy[i]) - fyy[i];
|
||||
fyz[i] = alpn1[i] * (Ryz[i] - EIGHT * PI * Syz[i]) - fyz[i];
|
||||
fzz[i] = alpn1[i] * (Rzz[i] - EIGHT * PI * Szz[i]) - fzz[i];
|
||||
}
|
||||
// 8ms //
|
||||
for (int i=0;i<all;i+=1) {
|
||||
double l_fxx = alpn1[i] * (Rxx[i] - EIGHT * PI * Sxx[i]) - fxx[i];
|
||||
double l_fxy = alpn1[i] * (Rxy[i] - EIGHT * PI * Sxy[i]) - fxy[i];
|
||||
double l_fxz = alpn1[i] * (Rxz[i] - EIGHT * PI * Sxz[i]) - fxz[i];
|
||||
double l_fyy = alpn1[i] * (Ryy[i] - EIGHT * PI * Syy[i]) - fyy[i];
|
||||
double l_fyz = alpn1[i] * (Ryz[i] - EIGHT * PI * Syz[i]) - fyz[i];
|
||||
double l_fzz = alpn1[i] * (Rzz[i] - EIGHT * PI * Szz[i]) - fzz[i];
|
||||
|
||||
/* Aij_rhs = fij - gij * f */
|
||||
Axx_rhs[i] = fxx[i] - gxx[i] * f[i];
|
||||
Ayy_rhs[i] = fyy[i] - gyy[i] * f[i];
|
||||
Azz_rhs[i] = fzz[i] - gzz[i] * f[i];
|
||||
Axy_rhs[i] = fxy[i] - gxy[i] * f[i];
|
||||
Axz_rhs[i] = fxz[i] - gxz[i] * f[i];
|
||||
Ayz_rhs[i] = fyz[i] - gyz[i] * f[i];
|
||||
Axx_rhs[i] = l_fxx - (dxx[i] + ONE) * f[i];
|
||||
Ayy_rhs[i] = l_fyy - (dyy[i] + ONE) * f[i];
|
||||
Azz_rhs[i] = l_fzz - (dzz[i] + ONE) * f[i];
|
||||
Axy_rhs[i] = l_fxy - gxy[i] * f[i];
|
||||
Axz_rhs[i] = l_fxz - gxz[i] * f[i];
|
||||
Ayz_rhs[i] = l_fyz - gyz[i] * f[i];
|
||||
|
||||
/* Now: store A_il A^l_j into fij: */
|
||||
fxx[i] =
|
||||
@@ -928,19 +890,19 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
f[i] * Axx_rhs[i]
|
||||
+ alpn1[i] * ( trK[i] * Axx[i] - TWO * fxx[i] )
|
||||
+ TWO * ( Axx[i] * betaxx[i] + Axy[i] * betayx[i] + Axz[i] * betazx[i] )
|
||||
- F2o3 * Axx[i] * div_beta[i];
|
||||
- F2o3 * Axx[i] * divb;
|
||||
|
||||
Ayy_rhs[i] =
|
||||
f[i] * Ayy_rhs[i]
|
||||
+ alpn1[i] * ( trK[i] * Ayy[i] - TWO * fyy[i] )
|
||||
+ TWO * ( Axy[i] * betaxy[i] + Ayy[i] * betayy[i] + Ayz[i] * betazy[i] )
|
||||
- F2o3 * Ayy[i] * div_beta[i];
|
||||
- F2o3 * Ayy[i] * divb;
|
||||
|
||||
Azz_rhs[i] =
|
||||
f[i] * Azz_rhs[i]
|
||||
+ alpn1[i] * ( trK[i] * Azz[i] - TWO * fzz[i] )
|
||||
+ TWO * ( Axz[i] * betaxz[i] + Ayz[i] * betayz[i] + Azz[i] * betazz[i] )
|
||||
- F2o3 * Azz[i] * div_beta[i];
|
||||
- F2o3 * Azz[i] * divb;
|
||||
|
||||
Axy_rhs[i] =
|
||||
f[i] * Axy_rhs[i]
|
||||
@@ -949,7 +911,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
+ Axz[i] * betazy[i]
|
||||
+ Ayy[i] * betayx[i]
|
||||
+ Ayz[i] * betazx[i]
|
||||
+ F1o3 * Axy[i] * div_beta[i]
|
||||
+ F1o3 * Axy[i] * divb
|
||||
- Axy[i] * betazz[i];
|
||||
|
||||
Ayz_rhs[i] =
|
||||
@@ -959,7 +921,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
+ Ayy[i] * betayz[i]
|
||||
+ Axz[i] * betaxy[i]
|
||||
+ Azz[i] * betazy[i]
|
||||
+ F1o3 * Ayz[i] * div_beta[i]
|
||||
+ F1o3 * Ayz[i] * divb
|
||||
- Ayz[i] * betaxx[i];
|
||||
|
||||
Axz_rhs[i] =
|
||||
@@ -969,7 +931,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
+ Axy[i] * betayz[i]
|
||||
+ Ayz[i] * betayx[i]
|
||||
+ Azz[i] * betazx[i]
|
||||
+ F1o3 * Axz[i] * div_beta[i]
|
||||
+ F1o3 * Axz[i] * divb
|
||||
- Axz[i] * betayy[i];
|
||||
|
||||
/* Compute trace of S_ij */
|
||||
@@ -1100,58 +1062,31 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
|
||||
#endif
|
||||
}
|
||||
// 26ms //
|
||||
lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS);
|
||||
lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA);
|
||||
lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS);
|
||||
lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS);
|
||||
lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA);
|
||||
lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS);
|
||||
lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS);
|
||||
lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS);
|
||||
lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA);
|
||||
lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA);
|
||||
lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS);
|
||||
lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS);
|
||||
lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS);
|
||||
lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS);
|
||||
lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS);
|
||||
lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA);
|
||||
lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA);
|
||||
lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS);
|
||||
lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA);
|
||||
lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS);
|
||||
lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS);
|
||||
lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS);
|
||||
lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS);
|
||||
lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS);
|
||||
// 20ms //
|
||||
if(eps>0){
|
||||
kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps);
|
||||
kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps);
|
||||
}
|
||||
// advection + KO dissipation with shared symmetry buffer
|
||||
lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps);
|
||||
lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps);
|
||||
// 2ms //
|
||||
if(co==0){
|
||||
for (int i=0;i<all;i+=1) {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,36 +0,0 @@
|
||||
/*
 * Declaration-only header for f_compute_rhs_bssn, protected by the
 * BSSN_RHS_CUDA_H include guard.
 *
 * When compiled as C++ the prototype is wrapped in extern "C", so the symbol
 * is emitted with C linkage (no C++ name mangling).
 *
 * NOTE(review): the signature uses C++ reference parameters (double &T,
 * int &Symmetry, int &Lev, double &eps, int &co), so despite the
 * __cplusplus guards this header cannot be included from a C translation
 * unit.
 */
#ifndef BSSN_RHS_CUDA_H
|
||||
#define BSSN_RHS_CUDA_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
 * f_compute_rhs_bssn: returns int; the meaning of the return value is not
 * visible from this declaration.
 *
 * Parameters, in declaration order (semantics inferred from names only --
 * TODO confirm against the implementation):
 *   ex                       - int array (presumably the grid extents)
 *   T                        - passed by reference
 *   X, Y, Z                  - coordinate arrays
 *   chi .. dtSfz             - field arrays
 *   chi_rhs .. dtSfz_rhs     - one "_rhs" array per field above
 *   rho, Sx..Sz, Sxx..Szz    - presumably matter source terms
 *   Gamxxx .. Gamzzz         - 18 arrays named like connection components
 *   Rxx .. Rzz               - 6 arrays named like curvature components
 *   ham_Res .. Gmz_Res       - arrays named like constraint residuals
 *   Symmetry, Lev, eps, co   - scalars passed by reference
 * All array arguments are non-const double pointers, so this declaration
 * does not distinguish inputs from outputs.
 */
int f_compute_rhs_bssn(int *ex, double &T,
|
||||
double *X, double *Y, double *Z,
|
||||
double *chi, double *trK,
|
||||
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
|
||||
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
|
||||
double *Gamx, double *Gamy, double *Gamz,
|
||||
double *Lap, double *betax, double *betay, double *betaz,
|
||||
double *dtSfx, double *dtSfy, double *dtSfz,
|
||||
double *chi_rhs, double *trK_rhs,
|
||||
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
|
||||
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
|
||||
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
|
||||
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
|
||||
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
|
||||
double *rho, double *Sx, double *Sy, double *Sz,
|
||||
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
|
||||
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
|
||||
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
|
||||
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
|
||||
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
|
||||
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
|
||||
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
|
||||
int &Symmetry, int &Lev, double &eps, int &co);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -141,26 +141,12 @@ void fdderivs(const int ex[3],
|
||||
const int j4_hi = ex2 - 3;
|
||||
const int k4_hi = ex3 - 3;
|
||||
|
||||
/*
|
||||
* Strategy A:
|
||||
* Avoid redundant work in overlap of 2nd/4th-order regions.
|
||||
* Only compute 2nd-order on shell points that are NOT overwritten by
|
||||
* the 4th-order pass.
|
||||
*/
|
||||
const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi);
|
||||
|
||||
if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
|
||||
for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
|
||||
const int kF = k0 + 1;
|
||||
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
|
||||
const int jF = j0 + 1;
|
||||
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
|
||||
if (has4 &&
|
||||
i0 >= i4_lo && i0 <= i4_hi &&
|
||||
j0 >= j4_lo && j0 <= j4_hi &&
|
||||
k0 >= k4_lo && k0 <= k4_hi) {
|
||||
continue;
|
||||
}
|
||||
const int iF = i0 + 1;
|
||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||
|
||||
@@ -207,7 +193,7 @@ void fdderivs(const int ex[3],
|
||||
}
|
||||
}
|
||||
|
||||
if (has4) {
|
||||
if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
|
||||
for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
|
||||
const int kF = k0 + 1;
|
||||
for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
|
||||
|
||||
@@ -81,26 +81,63 @@ void fderivs(const int ex[3],
|
||||
}
|
||||
|
||||
/*
|
||||
* Fortran loops:
|
||||
* do k=1,ex3-1
|
||||
* do j=1,ex2-1
|
||||
* do i=1,ex1-1
|
||||
* 两段式:
|
||||
* 1) 先在二阶可用区域计算二阶模板
|
||||
* 2) 再在高阶可用区域覆盖为四阶模板
|
||||
*
|
||||
* C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
|
||||
* 与原 if/elseif 逻辑等价,但减少逐点分支判断。
|
||||
*/
|
||||
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
||||
const int kF = k0 + 1;
|
||||
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
||||
const int jF = j0 + 1;
|
||||
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
||||
const int iF = i0 + 1;
|
||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||
const int i2_lo = (iminF > 0) ? iminF : 0;
|
||||
const int j2_lo = (jminF > 0) ? jminF : 0;
|
||||
const int k2_lo = (kminF > 0) ? kminF : 0;
|
||||
const int i2_hi = ex1 - 2;
|
||||
const int j2_hi = ex2 - 2;
|
||||
const int k2_hi = ex3 - 2;
|
||||
|
||||
const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
|
||||
const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
|
||||
const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
|
||||
const int i4_hi = ex1 - 3;
|
||||
const int j4_hi = ex2 - 3;
|
||||
const int k4_hi = ex3 - 3;
|
||||
|
||||
if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
|
||||
for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
|
||||
const int kF = k0 + 1;
|
||||
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
|
||||
const int jF = j0 + 1;
|
||||
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
|
||||
const int iF = i0 + 1;
|
||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||
|
||||
fx[p] = d2dx * (
|
||||
-fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
||||
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
||||
);
|
||||
|
||||
fy[p] = d2dy * (
|
||||
-fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
|
||||
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
||||
);
|
||||
|
||||
fz[p] = d2dz * (
|
||||
-fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
|
||||
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
|
||||
for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
|
||||
const int kF = k0 + 1;
|
||||
for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
|
||||
const int jF = j0 + 1;
|
||||
for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
|
||||
const int iF = i0 + 1;
|
||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||
|
||||
// if(i+2 <= imax .and. i-2 >= imin ... ) (全是 Fortran 索引)
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
{
|
||||
fx[p] = d12dx * (
|
||||
fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] -
|
||||
EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
||||
@@ -122,29 +159,9 @@ void fderivs(const int ex[3],
|
||||
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]
|
||||
);
|
||||
}
|
||||
// elseif(i+1 <= imax .and. i-1 >= imin ...)
|
||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
||||
{
|
||||
fx[p] = d2dx * (
|
||||
-fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
||||
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
||||
);
|
||||
|
||||
fy[p] = d2dy * (
|
||||
-fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
|
||||
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
||||
);
|
||||
|
||||
fz[p] = d2dz * (
|
||||
-fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
|
||||
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// free(fh);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1327,35 +1327,6 @@ end subroutine d2dump
|
||||
|
||||
return
|
||||
end subroutine polint
|
||||
|
||||
subroutine polint0(xa, ya, y, ordn)
|
||||
! Evaluate at x = 0 the Lagrange interpolating polynomial that passes
! through the ordn nodes (xa(j), ya(j)).
!
!   xa   (in)  node abscissas, length ordn
!   ya   (in)  node values, length ordn
!   y    (out) interpolated value at x = 0
!   ordn (in)  number of nodes
!
! Direct formula: y = sum_j wj * ya(j), where
!   wj = prod_{k /= j} xa(k) / (xa(k) - xa(j))
! is the j-th Lagrange basis polynomial evaluated at x = 0.
! Cost is O(ordn**2) because of the nested j/k loops.
! There is no guard against coincident nodes: xa(k) == xa(j) for k /= j
! divides by zero.
|
||||
implicit none
|
||||
integer, intent(in) :: ordn
|
||||
real*8, dimension(ordn), intent(in) :: xa, ya
|
||||
real*8, intent(out) :: y
|
||||
|
||||
integer :: j, k
|
||||
real*8 :: wj
|
||||
|
||||
y = 0.d0
|
||||
do j = 1, ordn
|
||||
! accumulate the basis weight of node j at x = 0
wj = 1.d0
|
||||
do k = 1, ordn
|
||||
if (k .ne. j) then
|
||||
! factor (0 - xa(k)) / (xa(j) - xa(k)), written with both signs flipped
wj = wj * xa(k) / (xa(k) - xa(j))
|
||||
endif
|
||||
enddo
|
||||
y = y + wj * ya(j)
|
||||
enddo
|
||||
|
||||
return
|
||||
end subroutine polint0
|
||||
!------------------------------------------------------------------------------
|
||||
!
|
||||
! interpolation in 2 dimensions, follow yx order
|
||||
!
|
||||
!------------------------------------------------------------------------------
|
||||
!------------------------------------------------------------------------------
|
||||
! Compute Lagrange interpolation basis weights for one target point.
|
||||
!------------------------------------------------------------------------------
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
/* This header is generated automatically by a custom profiling framework; it is not a hand-written, case-specific optimization. */
|
||||
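/* Update: the load-balancing problem has since been solved by optimizing the interpolation routine; this profile-based static balancing scheme is deprecated, and this header is no longer part of the build. */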
|
||||
/* Auto-generated from interp_lb_profile.bin — do not edit */
|
||||
#ifndef INTERP_LB_PROFILE_DATA_H
|
||||
#define INTERP_LB_PROFILE_DATA_H
|
||||
|
||||
@@ -63,19 +63,28 @@ void kodis(const int ex[3],
|
||||
* C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
|
||||
* 并定义 Fortran index: iF=i0+1, ...
|
||||
*/
|
||||
for (int k0 = 0; k0 < ex3; ++k0) {
|
||||
// 收紧循环范围:只遍历满足 iF±3/jF±3/kF±3 条件的内部点
|
||||
// iF-3 >= iminF => iF >= iminF+3 => i0 >= iminF+2 (因为 iF=i0+1)
|
||||
// iF+3 <= imaxF => iF <= imaxF-3 => i0 <= imaxF-4
|
||||
const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
|
||||
const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
|
||||
const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
|
||||
const int i0_hi = imaxF - 4; // inclusive
|
||||
const int j0_hi = jmaxF - 4;
|
||||
const int k0_hi = kmaxF - 4;
|
||||
|
||||
if (i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi) {
|
||||
free(fh);
|
||||
return;
|
||||
}
|
||||
|
||||
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
|
||||
const int kF = k0 + 1;
|
||||
for (int j0 = 0; j0 < ex2; ++j0) {
|
||||
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
|
||||
const int jF = j0 + 1;
|
||||
for (int i0 = 0; i0 < ex1; ++i0) {
|
||||
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
|
||||
const int iF = i0 + 1;
|
||||
|
||||
// Fortran if 条件:
|
||||
// i-3 >= imin .and. i+3 <= imax 等(都是 Fortran 索引)
|
||||
if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
|
||||
(jF - 3) >= jminF && (jF + 3) <= jmaxF &&
|
||||
(kF - 3) >= kminF && (kF + 3) <= kmaxF)
|
||||
{
|
||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||
|
||||
// 三个方向各一份同型的 7 点组合(实际上是对称的 6th-order dissipation/filter 核)
|
||||
@@ -100,7 +109,6 @@ void kodis(const int ex[3],
|
||||
// Fortran:
|
||||
// f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
|
||||
f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
248
AMSS_NCKU_source/lopsided_kodis_c.C
Normal file
248
AMSS_NCKU_source/lopsided_kodis_c.C
Normal file
@@ -0,0 +1,248 @@
|
||||
#include "tool.h"
|
||||
|
||||
/*
|
||||
* Combined advection (lopsided) + KO dissipation (kodis).
|
||||
* Uses one shared symmetry_bd buffer per call.
|
||||
*/
|
||||
void lopsided_kodis(const int ex[3],
|
||||
const double *X, const double *Y, const double *Z,
|
||||
const double *f, double *f_rhs,
|
||||
const double *Sfx, const double *Sfy, const double *Sfz,
|
||||
int Symmetry, const double SoA[3], double eps)
|
||||
{
|
||||
const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
|
||||
const double F6 = 6.0, F18 = 18.0;
|
||||
const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
|
||||
const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
|
||||
const double cof = 64.0; // 2^6
|
||||
|
||||
const int NO_SYMM = 0, EQ_SYMM = 1;
|
||||
|
||||
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
||||
|
||||
const double dX = X[1] - X[0];
|
||||
const double dY = Y[1] - Y[0];
|
||||
const double dZ = Z[1] - Z[0];
|
||||
|
||||
const double d12dx = ONE / F12 / dX;
|
||||
const double d12dy = ONE / F12 / dY;
|
||||
const double d12dz = ONE / F12 / dZ;
|
||||
|
||||
const int imaxF = ex1;
|
||||
const int jmaxF = ex2;
|
||||
const int kmaxF = ex3;
|
||||
|
||||
int iminF = 1, jminF = 1, kminF = 1;
|
||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
|
||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
|
||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
|
||||
|
||||
// fh for Fortran-style domain (-2:ex1,-2:ex2,-2:ex3)
|
||||
const size_t nx = (size_t)ex1 + 3;
|
||||
const size_t ny = (size_t)ex2 + 3;
|
||||
const size_t nz = (size_t)ex3 + 3;
|
||||
const size_t fh_size = nx * ny * nz;
|
||||
|
||||
double *fh = (double*)malloc(fh_size * sizeof(double));
|
||||
if (!fh) return;
|
||||
|
||||
symmetry_bd(3, ex, f, fh, SoA);
|
||||
|
||||
// Advection (same stencil logic as lopsided_c.C)
|
||||
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
||||
const int kF = k0 + 1;
|
||||
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
||||
const int jF = j0 + 1;
|
||||
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
||||
const int iF = i0 + 1;
|
||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||
|
||||
const double sfx = Sfx[p];
|
||||
if (sfx > ZEO) {
|
||||
if (i0 <= ex1 - 4) {
|
||||
f_rhs[p] += sfx * d12dx *
|
||||
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
|
||||
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
|
||||
} else if (i0 <= ex1 - 3) {
|
||||
f_rhs[p] += sfx * d12dx *
|
||||
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
||||
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
|
||||
} else if (i0 <= ex1 - 2) {
|
||||
f_rhs[p] -= sfx * d12dx *
|
||||
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
||||
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
|
||||
}
|
||||
} else if (sfx < ZEO) {
|
||||
if ((i0 - 2) >= iminF) {
|
||||
f_rhs[p] -= sfx * d12dx *
|
||||
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
||||
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
|
||||
} else if ((i0 - 1) >= iminF) {
|
||||
f_rhs[p] += sfx * d12dx *
|
||||
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
||||
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
|
||||
} else if (i0 >= iminF) {
|
||||
f_rhs[p] += sfx * d12dx *
|
||||
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
|
||||
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
|
||||
}
|
||||
}
|
||||
|
||||
const double sfy = Sfy[p];
|
||||
if (sfy > ZEO) {
|
||||
if (j0 <= ex2 - 4) {
|
||||
f_rhs[p] += sfy * d12dy *
|
||||
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
|
||||
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
|
||||
} else if (j0 <= ex2 - 3) {
|
||||
f_rhs[p] += sfy * d12dy *
|
||||
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
||||
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
|
||||
} else if (j0 <= ex2 - 2) {
|
||||
f_rhs[p] -= sfy * d12dy *
|
||||
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
||||
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
|
||||
}
|
||||
} else if (sfy < ZEO) {
|
||||
if ((j0 - 2) >= jminF) {
|
||||
f_rhs[p] -= sfy * d12dy *
|
||||
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
||||
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
|
||||
} else if ((j0 - 1) >= jminF) {
|
||||
f_rhs[p] += sfy * d12dy *
|
||||
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
||||
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
|
||||
} else if (j0 >= jminF) {
|
||||
f_rhs[p] += sfy * d12dy *
|
||||
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
|
||||
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
|
||||
}
|
||||
}
|
||||
|
||||
const double sfz = Sfz[p];
|
||||
if (sfz > ZEO) {
|
||||
if (k0 <= ex3 - 4) {
|
||||
f_rhs[p] += sfz * d12dz *
|
||||
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
|
||||
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
|
||||
} else if (k0 <= ex3 - 3) {
|
||||
f_rhs[p] += sfz * d12dz *
|
||||
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
||||
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
|
||||
} else if (k0 <= ex3 - 2) {
|
||||
f_rhs[p] -= sfz * d12dz *
|
||||
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
||||
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
|
||||
}
|
||||
} else if (sfz < ZEO) {
|
||||
if ((k0 - 2) >= kminF) {
|
||||
f_rhs[p] -= sfz * d12dz *
|
||||
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
||||
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
|
||||
} else if ((k0 - 1) >= kminF) {
|
||||
f_rhs[p] += sfz * d12dz *
|
||||
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
||||
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
|
||||
} else if (k0 >= kminF) {
|
||||
f_rhs[p] += sfz * d12dz *
|
||||
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
|
||||
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// KO dissipation (same domain restriction as kodiss_c.C)
|
||||
if (eps > ZEO) {
|
||||
const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
|
||||
const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
|
||||
const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
|
||||
const int i0_hi = imaxF - 4; // inclusive
|
||||
const int j0_hi = jmaxF - 4;
|
||||
const int k0_hi = kmaxF - 4;
|
||||
|
||||
if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
|
||||
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
|
||||
const int kF = k0 + 1;
|
||||
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
|
||||
const int jF = j0 + 1;
|
||||
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
|
||||
const int iF = i0 + 1;
|
||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||
|
||||
const double Dx_term =
|
||||
((fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
|
||||
SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
|
||||
FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
|
||||
TWT * fh[idx_fh_F(iF, jF, kF, ex)]) / dX;
|
||||
|
||||
const double Dy_term =
|
||||
((fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
|
||||
SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
|
||||
FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
|
||||
TWT * fh[idx_fh_F(iF, jF, kF, ex)]) / dY;
|
||||
|
||||
const double Dz_term =
|
||||
((fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
|
||||
SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
|
||||
FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
|
||||
TWT * fh[idx_fh_F(iF, jF, kF, ex)]) / dZ;
|
||||
|
||||
f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free(fh);
|
||||
}
|
||||
@@ -1,33 +1,35 @@
|
||||
|
||||
|
||||
include makefile.inc
|
||||
|
||||
## polint(ordn=6) kernel selector:
|
||||
## 1 (default): barycentric fast path
|
||||
## 0 : fallback to Neville path
|
||||
POLINT6_USE_BARY ?= 1
|
||||
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
||||
|
||||
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
||||
## make -> opt (PGO-guided, maximum performance)
|
||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
||||
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
|
||||
include makefile.inc
|
||||
|
||||
## polint(ordn=6) kernel selector:
|
||||
## 1 (default): barycentric fast path
|
||||
## 0 : fallback to Neville path
|
||||
POLINT6_USE_BARY ?= 1
|
||||
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
||||
|
||||
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
||||
## make -> opt (PGO-guided, maximum performance)
|
||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
||||
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
|
||||
|
||||
ifeq ($(PGO_MODE),instrument)
|
||||
## Phase 1: instrumentation — omit -fp-model fast=2 for numerical stability (-ipo is still enabled below)
|
||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||
else
|
||||
## opt (default): maximum performance with PGO profile data
|
||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=$(PROFDATA) \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=$(PROFDATA) \
|
||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||
endif
|
||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||
else
|
||||
## opt (default): maximum-performance build (previously used PGO data via -fprofile-instr-use=$(PROFDATA))
|
||||
## PGO is now disabled: testing showed that it made the code slower rather than faster
|
||||
## The INTERP_LB optimization is disabled as well: testing showed that it also made the code slower
|
||||
|
||||
|
||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||
endif
|
||||
|
||||
.SUFFIXES: .o .f90 .C .for .cu
|
||||
|
||||
@@ -43,10 +45,6 @@ endif
|
||||
.cu.o:
|
||||
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
||||
|
||||
# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
|
||||
bssn_rhs_cuda.o: bssn_rhs_cuda.cu macrodef.h
|
||||
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
||||
|
||||
# C rewrite of BSSN RHS kernel and helpers
|
||||
bssn_rhs_c.o: bssn_rhs_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
@@ -60,11 +58,14 @@ fdderivs_c.o: fdderivs_c.C
|
||||
kodiss_c.o: kodiss_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
lopsided_c.o: lopsided_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
lopsided_c.o: lopsided_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
lopsided_kodis_c.o: lopsided_kodis_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
|
||||
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
||||
@@ -81,25 +82,21 @@ TwoPunctureABE.o: TwoPunctureABE.C
|
||||
# Input files
|
||||
|
||||
## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
|
||||
ifeq ($(USE_CXX_KERNELS),0)
|
||||
# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
|
||||
CFILES =
|
||||
else
|
||||
# C++ mode (default): C rewrite of bssn_rhs and helper kernels
|
||||
CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o
|
||||
endif
|
||||
|
||||
# CUDA rewrite: bssn_rhs_cuda.o replaces all CFILES (stencils are built-in)
|
||||
CFILES_CUDA = bssn_rhs_cuda.o
|
||||
|
||||
## RK4 kernel switch (independent from USE_CXX_KERNELS)
|
||||
ifeq ($(USE_CXX_RK4),1)
|
||||
CFILES += rungekutta4_rout_c.o
|
||||
CFILES_CUDA += rungekutta4_rout_c.o
|
||||
RK4_F90_OBJ =
|
||||
else
|
||||
RK4_F90_OBJ = rungekutta4_rout.o
|
||||
endif
|
||||
ifeq ($(USE_CXX_KERNELS),0)
|
||||
# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
|
||||
CFILES =
|
||||
else
|
||||
# C++ mode (default): C rewrite of bssn_rhs and helper kernels
|
||||
CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
|
||||
endif
|
||||
|
||||
## RK4 kernel switch (independent from USE_CXX_KERNELS)
|
||||
ifeq ($(USE_CXX_RK4),1)
|
||||
CFILES += rungekutta4_rout_c.o
|
||||
RK4_F90_OBJ =
|
||||
else
|
||||
RK4_F90_OBJ = rungekutta4_rout.o
|
||||
endif
|
||||
|
||||
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||
cgh.o bssn_class.o surface_integral.o ShellPatch.o\
|
||||
@@ -116,12 +113,12 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
|
||||
NullShellPatch2_Evo.o \
|
||||
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
|
||||
|
||||
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
||||
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
||||
$(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
|
||||
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
|
||||
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
|
||||
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
|
||||
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
||||
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
||||
$(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
|
||||
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
|
||||
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
|
||||
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
|
||||
fadmquantites_bssn.o Z4c_rhs.o Z4c_rhs_ss.o point_diff_new_sh.o\
|
||||
cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\
|
||||
getnpem2.o empart.o NullNews.o fourdcurvature.o\
|
||||
@@ -181,12 +178,9 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
|
||||
misc.o : zbesh.o
|
||||
|
||||
# projects
|
||||
ABE: $(C++FILES) $(CFILES_CUDA) $(F90FILES) $(F77FILES) $(AHFDOBJS)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES_CUDA) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
|
||||
|
||||
ABE_CUDA: $(C++FILES) $(CFILES_CUDA) $(F90FILES) $(F77FILES) $(AHFDOBJS)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES_CUDA) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
|
||||
|
||||
ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
|
||||
|
||||
ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
|
||||
|
||||
@@ -194,4 +188,4 @@ TwoPunctureABE: $(TwoPunctureFILES)
|
||||
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
|
||||
|
||||
clean:
|
||||
rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f
|
||||
rm *.o ABE ABEGPU TwoPunctureABE make.log -f
|
||||
|
||||
@@ -62,4 +62,4 @@ CLINKER = mpiicpx
|
||||
Cu = nvcc
|
||||
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
||||
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
|
||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_80
|
||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
|
||||
|
||||
@@ -217,6 +217,7 @@
|
||||
real*8,dimension(2*ghost_width) :: X,Y,Z
|
||||
real*8, dimension(2*ghost_width,2*ghost_width) :: tmp2
|
||||
real*8, dimension(2*ghost_width) :: tmp1
|
||||
real*8 :: ddy
|
||||
real*8,dimension(3) :: ccp
|
||||
|
||||
#if (ghost_width == 2)
|
||||
@@ -579,7 +580,7 @@
|
||||
tmp1(ghost_width-cxI(1)+cxB(1) :ghost_width-cxI(1)+cxT(1) ) = funf(cxB(1):cxT(1),j,k)
|
||||
endif
|
||||
|
||||
call polint0(X,tmp1,funf(i,j,k),2*ghost_width)
|
||||
call polint(X,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
|
||||
|
||||
! for y direction
|
||||
elseif(sum(fg).eq.2.and.fg(2) .eq. 0.and. &
|
||||
@@ -689,7 +690,7 @@
|
||||
tmp1(ghost_width-cxI(2)+cxB(2) :ghost_width-cxI(2)+cxT(2) ) = funf(i,cxB(2):cxT(2),k)
|
||||
endif
|
||||
|
||||
call polint0(Y,tmp1,funf(i,j,k),2*ghost_width)
|
||||
call polint(Y,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
|
||||
|
||||
! for z direction
|
||||
elseif(sum(fg).eq.2.and.fg(3) .eq. 0.and. &
|
||||
@@ -801,7 +802,7 @@
|
||||
tmp1(ghost_width-cxI(3)+cxB(3) :ghost_width-cxI(3)+cxT(3) ) = funf(i,j,cxB(3):cxT(3))
|
||||
endif
|
||||
|
||||
call polint0(Z,tmp1,funf(i,j,k),2*ghost_width)
|
||||
call polint(Z,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
|
||||
|
||||
#else
|
||||
|
||||
@@ -1955,13 +1956,11 @@
|
||||
|
||||
real*8,dimension(3) :: CD,FD
|
||||
real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
|
||||
real*8 :: tmp_xyz_line(-2:extc(1)) ! 包含 X 向 6 点模板访问所需下界
|
||||
real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果
|
||||
real*8 :: v1, v2, v3, v4, v5, v6
|
||||
integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max,kc_min,kc_max
|
||||
integer :: i_lo, i_hi, j_lo, j_hi, k_lo, k_hi
|
||||
logical :: need_full_symmetry
|
||||
integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max
|
||||
real*8 :: res_line
|
||||
real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2)) ! 包含 Y/X 向模板访问所需下界
|
||||
real*8 :: tmp_z_slab(extc(1), extc(2)) ! 分配在 k 循环外
|
||||
if(wei.ne.3)then
|
||||
write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
|
||||
write(*,*)"dim = ",wei
|
||||
@@ -2064,41 +2063,24 @@
|
||||
endif
|
||||
enddo
|
||||
|
||||
ic_min = minval(cix(imino:imaxo))
|
||||
ic_max = maxval(cix(imino:imaxo))
|
||||
jc_min = minval(ciy(jmino:jmaxo))
|
||||
jc_max = maxval(ciy(jmino:jmaxo))
|
||||
kc_min = minval(ciz(kmino:kmaxo))
|
||||
kc_max = maxval(ciz(kmino:kmaxo))
|
||||
|
||||
maxcx = ic_max
|
||||
maxcy = jc_max
|
||||
maxcz = kc_max
|
||||
maxcx = maxval(cix(imino:imaxo))
|
||||
maxcy = maxval(ciy(jmino:jmaxo))
|
||||
maxcz = maxval(ciz(kmino:kmaxo))
|
||||
if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
|
||||
write(*,*)"error in prolong"
|
||||
return
|
||||
endif
|
||||
|
||||
i_lo = ic_min - 2
|
||||
i_hi = ic_max + 3
|
||||
j_lo = jc_min - 2
|
||||
j_hi = jc_max + 3
|
||||
k_lo = kc_min - 2
|
||||
k_hi = kc_max + 3
|
||||
need_full_symmetry = (i_lo < 1) .or. (j_lo < 1) .or. (k_lo < 1)
|
||||
if(need_full_symmetry)then
|
||||
call symmetry_bd(3,extc,func,funcc,SoA)
|
||||
else
|
||||
funcc(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi) = func(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi)
|
||||
endif
|
||||
|
||||
call symmetry_bd(3,extc,func,funcc,SoA)
|
||||
! 对每个 k(pz, kc 固定)预计算 Z 向插值的 2D 切片
|
||||
jc_min = minval(ciy(jmino:jmaxo))
|
||||
jc_max = maxval(ciy(jmino:jmaxo))
|
||||
|
||||
do k = kmino, kmaxo
|
||||
pz = piz(k); kc = ciz(k)
|
||||
! --- Pass 1: Z 方向,只算一次 ---
|
||||
do iy = jc_min-2, jc_max+3 ! 仅需的 iy 范围(对应 jc-2:jc+3)
|
||||
do ii = ic_min-2, ic_max+3 ! 仅需的 ii 范围(对应 cix-2:cix+3)
|
||||
do iy = jc_min-3, jc_max+3 ! 仅需的 iy 范围
|
||||
do ii = imini-3, imaxi+3 ! 仅需的 ii 范围
|
||||
tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
|
||||
end do
|
||||
end do
|
||||
@@ -2106,7 +2088,7 @@ do k = kmino, kmaxo
|
||||
do j = jmino, jmaxo
|
||||
py = piy(j); jc = ciy(j)
|
||||
! --- Pass 2: Y 方向 ---
|
||||
do ii = ic_min-2, ic_max+3
|
||||
do ii = imini-3, imaxi+3
|
||||
tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
|
||||
end do
|
||||
! --- Pass 3: X 方向 ---
|
||||
@@ -2369,12 +2351,9 @@ end do
|
||||
|
||||
real*8,dimension(3) :: CD,FD
|
||||
|
||||
real*8 :: tmp_xz_plane(-1:extf(1), 6)
|
||||
real*8 :: tmp_x_line(-1:extf(1))
|
||||
real*8 :: tmp_xz_plane(extf(1), 6)
|
||||
real*8 :: tmp_x_line(extf(1))
|
||||
integer :: fi, fj, fk, ii, jj, kk
|
||||
integer :: fi_min, fi_max, ii_lo, ii_hi
|
||||
integer :: fj_min, fj_max, fk_min, fk_max, jj_lo, jj_hi, kk_lo, kk_hi
|
||||
logical :: need_full_symmetry
|
||||
|
||||
if(wei.ne.3)then
|
||||
write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
|
||||
@@ -2454,34 +2433,7 @@ end do
|
||||
stop
|
||||
endif
|
||||
|
||||
! 仅计算 X 向最终写回所需的窗口:
|
||||
! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3)
|
||||
fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1
|
||||
fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1
|
||||
fj_min = 2*(jmino + lbc(2) - 1) - 1 - lbf(2) + 1
|
||||
fj_max = 2*(jmaxo + lbc(2) - 1) - 1 - lbf(2) + 1
|
||||
fk_min = 2*(kmino + lbc(3) - 1) - 1 - lbf(3) + 1
|
||||
fk_max = 2*(kmaxo + lbc(3) - 1) - 1 - lbf(3) + 1
|
||||
ii_lo = fi_min - 2
|
||||
ii_hi = fi_max + 3
|
||||
jj_lo = fj_min - 2
|
||||
jj_hi = fj_max + 3
|
||||
kk_lo = fk_min - 2
|
||||
kk_hi = fk_max + 3
|
||||
if(ii_lo < -1 .or. ii_hi > extf(1) .or. &
|
||||
jj_lo < -1 .or. jj_hi > extf(2) .or. &
|
||||
kk_lo < -1 .or. kk_hi > extf(3))then
|
||||
write(*,*)"restrict3: invalid stencil window"
|
||||
write(*,*)"ii=",ii_lo,ii_hi," jj=",jj_lo,jj_hi," kk=",kk_lo,kk_hi
|
||||
write(*,*)"extf=",extf
|
||||
stop
|
||||
endif
|
||||
need_full_symmetry = (ii_lo < 1) .or. (jj_lo < 1) .or. (kk_lo < 1)
|
||||
if(need_full_symmetry)then
|
||||
call symmetry_bd(2,extf,funf,funff,SoA)
|
||||
else
|
||||
funff(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi) = funf(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi)
|
||||
endif
|
||||
call symmetry_bd(2,extf,funf,funff,SoA)
|
||||
|
||||
!~~~~~~> restriction start...
|
||||
do k = kmino, kmaxo
|
||||
@@ -2493,7 +2445,7 @@ do k = kmino, kmaxo
|
||||
! 优化点 1: 显式展开 Z 方向计算,减少循环开销
|
||||
! 确保 ii 循环是最内层且连续访问
|
||||
!DIR$ VECTOR ALWAYS
|
||||
do ii = ii_lo, ii_hi
|
||||
do ii = 1, extf(1)
|
||||
! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
|
||||
! 这里直接硬编码 jj 的偏移,彻底消除一层循环
|
||||
tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
|
||||
@@ -2518,7 +2470,7 @@ do k = kmino, kmaxo
|
||||
|
||||
! 优化点 2: 同样向量化 Y 方向压缩
|
||||
!DIR$ VECTOR ALWAYS
|
||||
do ii = ii_lo, ii_hi
|
||||
do ii = 1, extf(1)
|
||||
tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
|
||||
C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
|
||||
C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
|
||||
|
||||
@@ -217,6 +217,7 @@
|
||||
real*8,dimension(2*ghost_width) :: X,Y,Z
|
||||
real*8, dimension(2*ghost_width,2*ghost_width) :: tmp2
|
||||
real*8, dimension(2*ghost_width) :: tmp1
|
||||
real*8 :: ddy
|
||||
|
||||
#if (ghost_width == 2)
|
||||
real*8, parameter :: C1=-1.d0/16,C2=9.d0/16
|
||||
@@ -469,7 +470,7 @@
|
||||
|
||||
tmp1(cxB(1)+ghost_width-i+1:cxT(1)+ghost_width-i+1) = fh(cxB(1):cxT(1),j,k)
|
||||
|
||||
call polint0(X,tmp1,funf(i,j,k),2*ghost_width)
|
||||
call polint(X,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
|
||||
|
||||
! for y direction
|
||||
elseif (fg(2) .eq. 0)then
|
||||
@@ -528,7 +529,7 @@
|
||||
|
||||
tmp1(cxB(2)+ghost_width-j+1:cxT(2)+ghost_width-j+1) = fh(i,cxB(2):cxT(2),k)
|
||||
|
||||
call polint0(Y,tmp1,funf(i,j,k),2*ghost_width)
|
||||
call polint(Y,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
|
||||
|
||||
! for z direction
|
||||
else
|
||||
@@ -587,7 +588,7 @@
|
||||
|
||||
tmp1(cxB(3)+ghost_width-k+1:cxT(3)+ghost_width-k+1) = fh(i,j,cxB(3):cxT(3))
|
||||
|
||||
call polint0(Z,tmp1,funf(i,j,k),2*ghost_width)
|
||||
call polint(Z,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
|
||||
|
||||
endif
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstddef>
|
||||
#include <complex>
|
||||
#include <immintrin.h>
|
||||
|
||||
namespace {
|
||||
@@ -117,6 +118,62 @@ inline void rk4_stage3(std::size_t n,
|
||||
|
||||
extern "C" {
|
||||
|
||||
/*
 * Applies one RK4 stage to a single real scalar.
 *
 *   dT    : time step
 *   f0    : value at the start of the step (read only)
 *   f1    : stage value, updated in place
 *   f_rhs : right-hand side; in stages 1 and 2 it is also used as the running
 *           accumulator (f_rhs += 2 * f1) before f1 is overwritten
 *   RK4   : stage selector, 0..3
 *
 * Stage by stage:
 *   0: f1 = f0 + dT/2 * f_rhs
 *   1: f_rhs += 2 * f1;  f1 = f0 + dT/2 * f1
 *   2: f_rhs += 2 * f1;  f1 = f0 + dT   * f1
 *   3: f1 = f0 + dT/6 * (f1 + f_rhs)
 *
 * Any other stage value prints a message to stderr and aborts the program.
 * All arguments are references.
 */
void f_rungekutta4_scalar(double &dT, double &f0, double &f1, double &f_rhs, int &RK4) {
  constexpr double F1o6 = 1.0 / 6.0;
  constexpr double HLF = 0.5;
  constexpr double TWO = 2.0;

  switch (RK4) {
  case 0:
    f1 = f0 + HLF * dT * f_rhs;
    break;
  case 1:
    // Accumulate first: the update of f1 below destroys the value f_rhs needs.
    f_rhs = f_rhs + TWO * f1;
    f1 = f0 + HLF * dT * f1;
    break;
  case 2:
    f_rhs = f_rhs + TWO * f1;
    f1 = f0 + dT * f1;
    break;
  case 3:
    f1 = f0 + F1o6 * dT * (f1 + f_rhs);
    break;
  default:
    std::fprintf(stderr, "rungekutta4_scalar_c: invalid RK4 stage %d\n", RK4);
    std::abort();
  }
}
|
||||
|
||||
/*
 * Applies one RK4 stage to a single complex scalar. Same update rules as the
 * real-valued scalar routine, with f0, f1 and f_rhs complex and dT real.
 *
 *   dT    : time step
 *   f0    : value at the start of the step (read only)
 *   f1    : stage value, updated in place
 *   f_rhs : right-hand side; in stages 1 and 2 it is also used as the running
 *           accumulator (f_rhs += 2 * f1) before f1 is overwritten
 *   RK4   : stage selector, 0..3
 *
 * Stage by stage:
 *   0: f1 = f0 + dT/2 * f_rhs
 *   1: f_rhs += 2 * f1;  f1 = f0 + dT/2 * f1
 *   2: f_rhs += 2 * f1;  f1 = f0 + dT   * f1
 *   3: f1 = f0 + dT/6 * (f1 + f_rhs)
 *
 * Any other stage value prints a message to stderr and aborts the program.
 */
void rungekutta4_cplxscalar_(double &dT,
                             std::complex<double> &f0,
                             std::complex<double> &f1,
                             std::complex<double> &f_rhs,
                             int &RK4) {
  constexpr double F1o6 = 1.0 / 6.0;
  constexpr double HLF = 0.5;
  constexpr double TWO = 2.0;

  switch (RK4) {
  case 0:
    f1 = f0 + HLF * dT * f_rhs;
    break;
  case 1:
    // Accumulate first: the update of f1 below destroys the value f_rhs needs.
    f_rhs = f_rhs + TWO * f1;
    f1 = f0 + HLF * dT * f1;
    break;
  case 2:
    f_rhs = f_rhs + TWO * f1;
    f1 = f0 + dT * f1;
    break;
  case 3:
    f1 = f0 + F1o6 * dT * (f1 + f_rhs);
    break;
  default:
    std::fprintf(stderr, "rungekutta4_cplxscalar_c: invalid RK4 stage %d\n", RK4);
    std::abort();
  }
}
|
||||
|
||||
int f_rungekutta4_rout(int *ex, double &dT,
|
||||
double *f0, double *f1, double *f_rhs,
|
||||
int &RK4) {
|
||||
|
||||
@@ -24,4 +24,10 @@ void lopsided(const int ex[3],
|
||||
const double *X, const double *Y, const double *Z,
|
||||
const double *f, double *f_rhs,
|
||||
const double *Sfx, const double *Sfy, const double *Sfz,
|
||||
int Symmetry, const double SoA[3]);
|
||||
int Symmetry, const double SoA[3]);
|
||||
|
||||
/*
 * Combined advection (lopsided) + KO dissipation update of f_rhs for the
 * field f; implemented in lopsided_kodis_c.C. Takes the same arguments as
 * lopsided() plus the dissipation strength eps.
 */
void lopsided_kodis(const int ex[3],
                    const double *X, const double *Y, const double *Z,
                    const double *f, double *f_rhs,
                    const double *Sfx, const double *Sfy, const double *Sfz,
                    int Symmetry, const double SoA[3], double eps);
|
||||
|
||||
@@ -70,7 +70,7 @@ def makefile_ABE():
|
||||
|
||||
## Build command with CPU binding to nohz_full cores
|
||||
if (input_data.GPU_Calculation == "no"):
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=optimize ABE"
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE"
|
||||
elif (input_data.GPU_Calculation == "yes"):
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
|
||||
else:
|
||||
|
||||
@@ -1,97 +0,0 @@
|
||||
# AMSS-NCKU PGO Profile Analysis Report
|
||||
|
||||
## 1. Profiling Environment
|
||||
|
||||
| Item | Value |
|
||||
|------|-------|
|
||||
| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
|
||||
| Instrumentation Flag | `-fprofile-instr-generate` |
|
||||
| Optimization Level (instrumented) | `-O2 -xHost -fma` |
|
||||
| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
|
||||
| Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
|
||||
| Merged Profile | `default.profdata` (394 KB) |
|
||||
| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
|
||||
|
||||
## 2. Reduced Simulation Parameters (for profiling run)
|
||||
|
||||
| Parameter | Production Value | Profiling Value |
|
||||
|-----------|-----------------|-----------------|
|
||||
| MPI_processes | 64 | 1 |
|
||||
| grid_level | 9 | 4 |
|
||||
| static_grid_level | 5 | 3 |
|
||||
| static_grid_number | 96 | 24 |
|
||||
| moving_grid_number | 48 | 16 |
|
||||
| largest_box_xyz_max | 320^3 | 160^3 |
|
||||
| Final_Evolution_Time | 1000.0 | 10.0 |
|
||||
| Evolution_Step_Number | 10,000,000 | 1,000 |
|
||||
| Detector_Number | 12 | 2 |
|
||||
|
||||
## 3. Profile Summary
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Total instrumented functions | 1,392 |
|
||||
| Functions with non-zero counts | 117 (8.4%) |
|
||||
| Functions with zero counts | 1,275 (91.6%) |
|
||||
| Maximum function entry count | 386,459,248 |
|
||||
| Maximum internal block count | 370,477,680 |
|
||||
| Total block count | 4,198,023,118 |
|
||||
|
||||
## 4. Top 20 Hotspot Functions
|
||||
|
||||
| Rank | Total Count | Max Block Count | Function | Category |
|
||||
|------|------------|-----------------|----------|----------|
|
||||
| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
|
||||
| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
|
||||
| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
|
||||
| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
|
||||
| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
|
||||
| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
|
||||
| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
|
||||
| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
|
||||
| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
|
||||
| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
|
||||
| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
|
||||
| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
|
||||
| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
|
||||
| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
|
||||
| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
|
||||
| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
|
||||
| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
|
||||
| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
|
||||
| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
|
||||
| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
|
||||
|
||||
## 5. Hotspot Category Breakdown
|
||||
|
||||
The top 20 functions account for ~99% of total execution counts:
|
||||
|
||||
| Category | Functions | Combined Count | Share |
|
||||
|----------|-----------|---------------|-------|
|
||||
| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
|
||||
| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
|
||||
| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
|
||||
| Time integration | rungekutta4_rout_ | ~119M | ~3% |
|
||||
| Dissipation | kodis_ | ~92M | ~2% |
|
||||
| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~240M | ~6% |
|
||||
|
||||
## 6. Conclusions
|
||||
|
||||
1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
|
||||
2. **Hotspot concentration is high**: The top 5 functions alone account for ~83% of all counts (the top 4 for ~76%), which is ideal for PGO — the compiler has strong branch/layout optimization targets.
|
||||
3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
|
||||
4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
|
||||
5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
|
||||
|
||||
## 7. PGO Phase 2 Usage
|
||||
|
||||
To apply the profile, use the following flags in `makefile.inc`:
|
||||
|
||||
```makefile
|
||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
|
||||
-align array64byte -fpp -I${MKLROOT}/include
|
||||
```
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user