Compare commits

...

26 Commits

Author SHA1 Message Date
9c31384b2f Add optional BSSN kernel profiling switches 2026-04-13 16:51:06 +08:00
e4e741caa1 Remove dead chi derivative setup in BSSN RHS 2026-04-13 15:55:43 +08:00
65e0f95f40 Localize chi Ricci intermediates in RHS 2026-04-13 15:14:31 +08:00
f9fbf97e64 Elide dead stores in BSSN RHS hot path 2026-04-13 15:10:22 +08:00
968522995b Add fine-grained step timing and trim BH RHS overhead 2026-04-13 14:50:55 +08:00
f3988ac8ca Merge wave and mass extraction interpolation 2026-04-13 13:17:36 +08:00
e4c25eb21f Cache wave extraction angular kernels 2026-04-13 12:40:20 +08:00
4b10519876 Reuse mass integrand across detector radii 2026-04-13 11:55:41 +08:00
3a58273501 Batch constraint norm reductions 2026-04-13 11:48:02 +08:00
5c65cea2f0 Optimize constraint refresh after regrid 2026-04-13 11:39:50 +08:00
8c1f4d8108 迁移C算子的循环融合和临时量消除 2026-03-03 16:20:15 +08:00
d310ef918b bssn_rhs(fortran): migrate C kernel loop-fusion optimizations 2026-03-03 16:20:15 +08:00
b35e1b289f 设置开关关闭内存打印统计 2026-03-03 16:17:47 +08:00
05851b2c59 关闭静态负载 2026-03-03 16:17:47 +08:00
3b39583d67 fix(bssn_rhs) 2026-03-03 16:06:33 +08:00
688bdb6708 Merge pull request 'cjy-dystopia' (#3) from cjy-dystopia into main
Reviewed-on: #3
2026-03-02 21:36:26 +08:00
5070134857 perf(transfer_cached): 将 per-call new/delete 的 req_node/req_is_recv/completed 数组移入 SyncCache 复用
避免 transfer_cached 每次调用分配释放 3 个临时数组,减少堆操作开销。

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-02 21:14:35 +08:00
4012e9d068 perf(RestrictProlong): 用 Restrict_cached/OutBdLow2Hi_cached 替换非缓存版本,Sync_finish 改为渐进式解包
- RestrictProlong/RestrictProlong_aux 中的 Restrict() 和 OutBdLow2Hi() 替换为 _cached 版本,
  复用 gridseg 列表和 MPI 缓冲区,避免每次调用重新分配
- 新增 sync_cache_restrict/sync_cache_outbd 两组 per-level 缓存
- Sync_finish 从 MPI_Waitall 改为 MPI_Waitsome 渐进式解包,降低尾延迟
- AsyncSyncState 扩展 req_node/req_is_recv/pending_recv 字段支持渐进解包

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-02 20:48:38 +08:00
b3c367f15b prolong3 改为先算实际 stencil 窗口;只有窗口触及对称边界时才走全域 symmetry_bd,否则只复制必需窗口。restrict3 同样改成窗口判定,无触边时仅填 ii/jj/kk 必需窗口。 2026-03-02 17:38:56 +08:00
e73911f292 perf(restrict3): shrink X-pass ii sweep to required overlap window
- compute fi_min/fi_max from output i-range and derive ii_lo/ii_hi
 - replace full ii sweep (-1:extf(1)) with windowed sweep in Z/Y precompute passes
 - keep stencil math unchanged; add bounds sanity check for ii window
2026-03-02 17:37:13 +08:00
7543d3e8c7 perf(MPatch): 用空间 bin 索引加速 Interp_Points 的 block 归属查找
- 为 Patch::Interp_Points 三个重载引入 BlockBinIndex(候选筛选 + 全扫回退)
  - 保持原 point-in-block 判定与后续插值/通信流程不变
  - 将逐点线性扫块从 O(N_points*N_blocks) 降为近似 O(N_points*k)
  - 测试:bin 上限如果太大,会引入不必要的索引构建开销。将 bins 上限设为 16。

Co-authored-by: gpt-5.3-codex
2026-03-02 17:37:13 +08:00
42c69fab24 refactor(Parallel): streamline MPI communication by consolidating request handling and memory management 2026-03-02 17:37:13 +08:00
95220a05c8 optimize fdderivs core-region branch elimination for ghost_width=3 2026-03-02 17:33:26 +08:00
466b084a58 fix prolong/restrict index bounds after cherry-pick 12e1f63 2026-03-02 13:59:47 +08:00
61ccef9f97 prolong3: 减少Z-pass 冗余计算 2026-03-02 13:58:52 +08:00
524d1d1512 Merge pull request 'cjy-dystopia' (#2) from cjy-dystopia into main
Reviewed-on: #2
2026-03-01 19:22:09 +08:00
19 changed files with 3944 additions and 1947 deletions

View File

@@ -7,6 +7,7 @@
#include <string>
#include <cmath>
#include <new>
#include <vector>
using namespace std;
#include "misc.h"
@@ -17,6 +18,168 @@ using namespace std;
#include "interp_lb_profile.h"
#endif
namespace
{
struct InterpBlockView
{
Block *bp;
double llb[dim];
double uub[dim];
};
struct BlockBinIndex
{
int bins[dim];
double lo[dim];
double inv[dim];
vector<InterpBlockView> views;
vector<vector<int>> bin_to_blocks;
bool valid;
BlockBinIndex() : valid(false)
{
for (int i = 0; i < dim; i++)
{
bins[i] = 1;
lo[i] = 0.0;
inv[i] = 0.0;
}
}
};
inline int clamp_int(int v, int lo, int hi)
{
return (v < lo) ? lo : ((v > hi) ? hi : v);
}
inline int coord_to_bin(double x, double lo, double inv, int nb)
{
if (nb <= 1 || inv <= 0.0)
return 0;
int b = int(floor((x - lo) * inv));
return clamp_int(b, 0, nb - 1);
}
inline int bin_loc(const BlockBinIndex &index, int b0, int b1, int b2)
{
return b0 + index.bins[0] * (b1 + index.bins[1] * b2);
}
inline bool point_in_block_view(const InterpBlockView &view, const double *pox, const double *DH)
{
for (int i = 0; i < dim; i++)
{
if (pox[i] - view.llb[i] < -DH[i] / 2 || pox[i] - view.uub[i] > DH[i] / 2)
return false;
}
return true;
}
void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
{
index = BlockBinIndex();
MyList<Block> *Bp = patch->blb;
while (Bp)
{
Block *BP = Bp->data;
InterpBlockView view;
view.bp = BP;
for (int i = 0; i < dim; i++)
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
}
index.views.push_back(view);
if (Bp == patch->ble)
break;
Bp = Bp->next;
}
const int nblocks = int(index.views.size());
if (nblocks <= 0)
return;
int bins_1d = int(ceil(pow(double(nblocks), 1.0 / 3.0)));
bins_1d = clamp_int(bins_1d, 1, 32);
for (int i = 0; i < dim; i++)
{
index.bins[i] = bins_1d;
index.lo[i] = patch->bbox[i] + patch->lli[i] * DH[i];
const double hi = patch->bbox[dim + i] - patch->uui[i] * DH[i];
if (hi > index.lo[i] && bins_1d > 1)
index.inv[i] = bins_1d / (hi - index.lo[i]);
else
index.inv[i] = 0.0;
}
index.bin_to_blocks.resize(index.bins[0] * index.bins[1] * index.bins[2]);
for (int bi = 0; bi < nblocks; bi++)
{
const InterpBlockView &view = index.views[bi];
int bmin[dim], bmax[dim];
for (int d = 0; d < dim; d++)
{
const double low = view.llb[d] - DH[d] / 2;
const double up = view.uub[d] + DH[d] / 2;
bmin[d] = coord_to_bin(low, index.lo[d], index.inv[d], index.bins[d]);
bmax[d] = coord_to_bin(up, index.lo[d], index.inv[d], index.bins[d]);
if (bmax[d] < bmin[d])
{
int t = bmin[d];
bmin[d] = bmax[d];
bmax[d] = t;
}
}
for (int bz = bmin[2]; bz <= bmax[2]; bz++)
for (int by = bmin[1]; by <= bmax[1]; by++)
for (int bx = bmin[0]; bx <= bmax[0]; bx++)
index.bin_to_blocks[bin_loc(index, bx, by, bz)].push_back(bi);
}
index.valid = true;
}
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
{
if (!index.valid)
return -1;
const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
const int bz = coord_to_bin(pox[2], index.lo[2], index.inv[2], index.bins[2]);
const vector<int> &cand = index.bin_to_blocks[bin_loc(index, bx, by, bz)];
for (size_t ci = 0; ci < cand.size(); ci++)
{
const int bi = cand[ci];
if (point_in_block_view(index.views[bi], pox, DH))
return bi;
}
// Fallback to full scan for numerical edge cases around bin boundaries.
for (size_t bi = 0; bi < index.views.size(); bi++)
if (point_in_block_view(index.views[bi], pox, DH))
return int(bi);
return -1;
}
} // namespace
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
{
@@ -367,9 +530,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
for (int j = 0; j < NN; j++)
owner_rank[j] = -1;
double DH[dim], llb[dim], uub[dim];
double DH[dim];
for (int i = 0; i < dim; i++)
DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
for (int j = 0; j < NN; j++) // run along points
{
@@ -392,57 +557,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
}
}
MyList<Block> *Bp = blb;
bool notfind = true;
while (notfind && Bp) // run along Blocks
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = Bp->data;
bool flag = true;
for (int i = 0; i < dim; i++)
Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
flag = false;
break;
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
if (Bp == ble)
break;
Bp = Bp->next;
}
}
@@ -535,9 +667,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
for (int j = 0; j < NN; j++)
owner_rank[j] = -1;
double DH[dim], llb[dim], uub[dim];
double DH[dim];
for (int i = 0; i < dim; i++)
DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
// --- Interpolation phase (identical to original) ---
for (int j = 0; j < NN; j++)
@@ -561,56 +695,23 @@ void Patch::Interp_Points(MyList<var> *VarList,
}
}
MyList<Block> *Bp = blb;
bool notfind = true;
while (notfind && Bp)
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = Bp->data;
bool flag = true;
for (int i = 0; i < dim; i++)
Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
varl = VarList;
int k = 0;
while (varl)
{
flag = false;
break;
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
varl = VarList;
int k = 0;
while (varl)
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
if (Bp == ble)
break;
Bp = Bp->next;
}
}
@@ -833,9 +934,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
MPI_Comm_group(Comm_here, &local_group);
double DH[dim], llb[dim], uub[dim];
double DH[dim];
for (int i = 0; i < dim; i++)
DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
for (int j = 0; j < NN; j++) // run along points
{
@@ -858,57 +961,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
}
}
MyList<Block> *Bp = blb;
bool notfind = true;
while (notfind && Bp) // run along Blocks
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
Block *BP = Bp->data;
bool flag = true;
for (int i = 0; i < dim; i++)
Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
flag = false;
break;
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
if (Bp == ble)
break;
Bp = Bp->next;
}
}

View File

@@ -3893,66 +3893,105 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
int node;
MPI_Request *reqs;
MPI_Status *stats;
reqs = new MPI_Request[2 * cpusize];
stats = new MPI_Status[2 * cpusize];
MPI_Request *reqs = new MPI_Request[2 * cpusize];
MPI_Status *stats = new MPI_Status[2 * cpusize];
int *req_node = new int[2 * cpusize];
int *req_is_recv = new int[2 * cpusize];
int *completed = new int[2 * cpusize];
int req_no = 0;
int pending_recv = 0;
double **send_data, **rec_data;
send_data = new double *[cpusize];
rec_data = new double *[cpusize];
int length;
double **send_data = new double *[cpusize];
double **rec_data = new double *[cpusize];
int *send_lengths = new int[cpusize];
int *recv_lengths = new int[cpusize];
for (node = 0; node < cpusize; node++)
{
send_data[node] = rec_data[node] = 0;
if (node == myrank)
send_lengths[node] = recv_lengths[node] = 0;
}
// Post receives first so peers can progress rendezvous early.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (recv_lengths[node] > 0)
{
if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
rec_data[node] = new double[recv_lengths[node]];
if (!rec_data[node])
{
rec_data[node] = new double[length];
if (!rec_data[node])
{
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
pending_recv++;
}
else
}
// Local transfer on this rank.
recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
if (recv_lengths[myrank] > 0)
{
rec_data[myrank] = new double[recv_lengths[myrank]];
if (!rec_data[myrank])
{
// send from this cpu to cpu#node
if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
cout << "out of memory when new in short transfer, place 2" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
// Pack and post sends.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (send_lengths[node] > 0)
{
send_data[node] = new double[send_lengths[node]];
if (!send_data[node])
{
send_data[node] = new double[length];
if (!send_data[node])
{
cout << "out of memory when new in short transfer, place 2" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
cout << "out of memory when new in short transfer, place 3" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
// receive from cpu#node to this cpu
if (length = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 0;
req_no++;
}
}
// Unpack as soon as receive completes to reduce pure wait time.
while (pending_recv > 0)
{
int outcount = 0;
MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
{
int idx = completed[i];
if (idx >= 0 && req_is_recv[idx])
{
rec_data[node] = new double[length];
if (!rec_data[node])
{
cout << "out of memory when new in short transfer, place 3" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
int recv_node = req_node[idx];
data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
pending_recv--;
}
}
}
// wait for all requests to complete
MPI_Waitall(req_no, reqs, stats);
for (node = 0; node < cpusize; node++)
if (rec_data[node])
data_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
if (rec_data[myrank])
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
for (node = 0; node < cpusize; node++)
{
@@ -3964,8 +4003,13 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
delete[] reqs;
delete[] stats;
delete[] req_node;
delete[] req_is_recv;
delete[] completed;
delete[] send_data;
delete[] rec_data;
delete[] send_lengths;
delete[] recv_lengths;
}
//
void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
@@ -3978,66 +4022,105 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
int node;
MPI_Request *reqs;
MPI_Status *stats;
reqs = new MPI_Request[2 * cpusize];
stats = new MPI_Status[2 * cpusize];
MPI_Request *reqs = new MPI_Request[2 * cpusize];
MPI_Status *stats = new MPI_Status[2 * cpusize];
int *req_node = new int[2 * cpusize];
int *req_is_recv = new int[2 * cpusize];
int *completed = new int[2 * cpusize];
int req_no = 0;
int pending_recv = 0;
double **send_data, **rec_data;
send_data = new double *[cpusize];
rec_data = new double *[cpusize];
int length;
double **send_data = new double *[cpusize];
double **rec_data = new double *[cpusize];
int *send_lengths = new int[cpusize];
int *recv_lengths = new int[cpusize];
for (node = 0; node < cpusize; node++)
{
send_data[node] = rec_data[node] = 0;
if (node == myrank)
send_lengths[node] = recv_lengths[node] = 0;
}
// Post receives first so peers can progress rendezvous early.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (recv_lengths[node] > 0)
{
if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
rec_data[node] = new double[recv_lengths[node]];
if (!rec_data[node])
{
rec_data[node] = new double[length];
if (!rec_data[node])
{
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packermix(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
pending_recv++;
}
else
}
// Local transfer on this rank.
recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
if (recv_lengths[myrank] > 0)
{
rec_data[myrank] = new double[recv_lengths[myrank]];
if (!rec_data[myrank])
{
// send from this cpu to cpu#node
if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
cout << "out of memory when new in short transfer, place 2" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
// Pack and post sends.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (send_lengths[node] > 0)
{
send_data[node] = new double[send_lengths[node]];
if (!send_data[node])
{
send_data[node] = new double[length];
if (!send_data[node])
{
cout << "out of memory when new in short transfer, place 2" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
cout << "out of memory when new in short transfer, place 3" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
// receive from cpu#node to this cpu
if (length = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 0;
req_no++;
}
}
// Unpack as soon as receive completes to reduce pure wait time.
while (pending_recv > 0)
{
int outcount = 0;
MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
{
int idx = completed[i];
if (idx >= 0 && req_is_recv[idx])
{
rec_data[node] = new double[length];
if (!rec_data[node])
{
cout << "out of memory when new in short transfer, place 3" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
int recv_node = req_node[idx];
data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
pending_recv--;
}
}
}
// wait for all requests to complete
MPI_Waitall(req_no, reqs, stats);
for (node = 0; node < cpusize; node++)
if (rec_data[node])
data_packermix(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
if (rec_data[myrank])
data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
for (node = 0; node < cpusize; node++)
{
@@ -4049,8 +4132,13 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
delete[] reqs;
delete[] stats;
delete[] req_node;
delete[] req_is_recv;
delete[] completed;
delete[] send_data;
delete[] rec_data;
delete[] send_lengths;
delete[] recv_lengths;
}
void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
{
@@ -4232,7 +4320,7 @@ Parallel::SyncCache::SyncCache()
: valid(false), cpusize(0), combined_src(0), combined_dst(0),
send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
lengths_valid(false)
lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0)
{
}
// SyncCache invalidate: free grid segment lists but keep buffers
@@ -4271,11 +4359,15 @@ void Parallel::SyncCache::destroy()
if (recv_bufs) delete[] recv_bufs;
if (reqs) delete[] reqs;
if (stats) delete[] stats;
if (tc_req_node) delete[] tc_req_node;
if (tc_req_is_recv) delete[] tc_req_is_recv;
if (tc_completed) delete[] tc_completed;
combined_src = combined_dst = 0;
send_lengths = recv_lengths = 0;
send_buf_caps = recv_buf_caps = 0;
send_bufs = recv_bufs = 0;
reqs = 0; stats = 0;
tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0;
cpusize = 0; max_reqs = 0;
}
// transfer_cached: reuse pre-allocated buffers from SyncCache
@@ -4289,64 +4381,96 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
int cpusize = cache.cpusize;
int req_no = 0;
int pending_recv = 0;
int node;
int *req_node = cache.tc_req_node;
int *req_is_recv = cache.tc_req_is_recv;
int *completed = cache.tc_completed;
// Post receives first so peers can progress rendezvous early.
for (node = 0; node < cpusize; node++)
{
if (node == myrank)
if (node == myrank) continue;
int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[node] = rlength;
if (rlength > 0)
{
int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[node] = length;
if (length > 0)
if (rlength > cache.recv_buf_caps[node])
{
if (length > cache.recv_buf_caps[node])
{
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
cache.recv_bufs[node] = new double[length];
cache.recv_buf_caps[node] = length;
}
data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
cache.recv_bufs[node] = new double[rlength];
cache.recv_buf_caps[node] = rlength;
}
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
pending_recv++;
}
else
}
// Local transfer on this rank.
int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[myrank] = self_len;
if (self_len > 0)
{
if (self_len > cache.recv_buf_caps[myrank])
{
// send
int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cache.send_lengths[node] = slength;
if (slength > 0)
if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank];
cache.recv_bufs[myrank] = new double[self_len];
cache.recv_buf_caps[myrank] = self_len;
}
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
// Pack and post sends.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cache.send_lengths[node] = slength;
if (slength > 0)
{
if (slength > cache.send_buf_caps[node])
{
if (slength > cache.send_buf_caps[node])
{
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
cache.send_bufs[node] = new double[slength];
cache.send_buf_caps[node] = slength;
}
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
cache.send_bufs[node] = new double[slength];
cache.send_buf_caps[node] = slength;
}
// recv
int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[node] = rlength;
if (rlength > 0)
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 0;
req_no++;
}
}
// Unpack as soon as receive completes to reduce pure wait time.
while (pending_recv > 0)
{
int outcount = 0;
MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
{
int idx = completed[i];
if (idx >= 0 && req_is_recv[idx])
{
if (rlength > cache.recv_buf_caps[node])
{
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
cache.recv_bufs[node] = new double[rlength];
cache.recv_buf_caps[node] = rlength;
}
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
int recv_node_i = req_node[idx];
data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
pending_recv--;
}
}
}
MPI_Waitall(req_no, cache.reqs, cache.stats);
if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
for (node = 0; node < cpusize; node++)
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (self_len > 0)
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
}
// Sync_cached: build grid segment lists on first call, reuse on subsequent calls
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
{
if (!cache.valid)
@@ -4374,6 +4498,9 @@ void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
cache.max_reqs = 2 * cpusize;
cache.reqs = new MPI_Request[cache.max_reqs];
cache.stats = new MPI_Status[cache.max_reqs];
cache.tc_req_node = new int[cache.max_reqs];
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
}
for (int node = 0; node < cpusize; node++)
@@ -4474,6 +4601,9 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
cache.max_reqs = 2 * cpusize;
cache.reqs = new MPI_Request[cache.max_reqs];
cache.stats = new MPI_Status[cache.max_reqs];
cache.tc_req_node = new int[cache.max_reqs];
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
}
for (int node = 0; node < cpusize; node++)
@@ -4544,6 +4674,11 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
int cpusize = cache.cpusize;
state.req_no = 0;
state.active = true;
state.pending_recv = 0;
// Allocate tracking arrays
delete[] state.req_node; delete[] state.req_is_recv;
state.req_node = new int[cache.max_reqs];
state.req_is_recv = new int[cache.max_reqs];
MyList<Parallel::gridseg> **src = cache.combined_src;
MyList<Parallel::gridseg> **dst = cache.combined_dst;
@@ -4588,6 +4723,8 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
cache.send_buf_caps[node] = slength;
}
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
state.req_node[state.req_no] = node;
state.req_is_recv[state.req_no] = 0;
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
}
int rlength;
@@ -4605,29 +4742,60 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
cache.recv_bufs[node] = new double[rlength];
cache.recv_buf_caps[node] = rlength;
}
state.req_node[state.req_no] = node;
state.req_is_recv[state.req_no] = 1;
state.pending_recv++;
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
}
}
}
cache.lengths_valid = true;
}
// Sync_finish: wait for async MPI operations and unpack
// Sync_finish: progressive unpack as receives complete, then wait for sends
void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
MyList<var> *VarList, int Symmetry)
{
if (!state.active)
return;
MPI_Waitall(state.req_no, cache.reqs, cache.stats);
int cpusize = cache.cpusize;
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MyList<Parallel::gridseg> **src = cache.combined_src;
MyList<Parallel::gridseg> **dst = cache.combined_dst;
for (int node = 0; node < cpusize; node++)
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
// Unpack local data first (no MPI needed)
if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0)
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry);
// Progressive unpack of remote receives
if (state.pending_recv > 0 && state.req_no > 0)
{
int pending = state.pending_recv;
int *completed = new int[cache.max_reqs];
while (pending > 0)
{
int outcount = 0;
MPI_Waitsome(state.req_no, cache.reqs, &outcount, completed, cache.stats);
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
{
int idx = completed[i];
if (idx >= 0 && state.req_is_recv[idx])
{
int recv_node = state.req_node[idx];
data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry);
pending--;
}
}
}
delete[] completed;
}
// Wait for remaining sends
if (state.req_no > 0) MPI_Waitall(state.req_no, cache.reqs, cache.stats);
delete[] state.req_node; state.req_node = 0;
delete[] state.req_is_recv; state.req_is_recv = 0;
state.active = false;
}
// collect buffer grid segments or blocks for the periodic boundary condition of given patch
@@ -5085,10 +5253,10 @@ void Parallel::PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry)
delete[] transfer_src;
delete[] transfer_dst;
}
double Parallel::L2Norm(Patch *Pat, var *vf)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double Parallel::L2Norm(Patch *Pat, var *vf)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double tvf, dtvf = 0;
int BDW = ghost_width;
@@ -5113,13 +5281,48 @@ double Parallel::L2Norm(Patch *Pat, var *vf)
MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
tvf = sqrt(tvf);
return tvf;
}
double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
return tvf;
}
void Parallel::L2Norm7(Patch *Pat, var **vf, double *norms)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double tvf[7], dtvf[7];
int BDW = ghost_width;
for (int i = 0; i < 7; i++)
dtvf[i] = 0;
MyList<Block> *BP = Pat->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
cg->fgfs[vf[6]->sgfn], tvf, BDW);
for (int i = 0; i < 7; i++)
dtvf[i] += tvf[i];
}
if (BP == Pat->ble)
break;
BP = BP->next;
}
MPI_Allreduce(dtvf, tvf, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
for (int i = 0; i < 7; i++)
norms[i] = sqrt(tvf[i]);
}
double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double tvf, dtvf = 0;
int BDW = ghost_width;
@@ -5144,12 +5347,47 @@ double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
tvf = sqrt(tvf);
return tvf;
}
void Parallel::checkgsl(MyList<Parallel::gridseg> *pp, bool first_only)
{
int myrank = 0;
return tvf;
}
void Parallel::L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double tvf[7], dtvf[7];
int BDW = ghost_width;
for (int i = 0; i < 7; i++)
dtvf[i] = 0;
MyList<Block> *BP = Pat->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
cg->fgfs[vf[6]->sgfn], tvf, BDW);
for (int i = 0; i < 7; i++)
dtvf[i] += tvf[i];
}
if (BP == Pat->ble)
break;
BP = BP->next;
}
MPI_Allreduce(dtvf, tvf, 7, MPI_DOUBLE, MPI_SUM, Comm_here);
for (int i = 0; i < 7; i++)
norms[i] = sqrt(tvf[i]);
}
void Parallel::checkgsl(MyList<Parallel::gridseg> *pp, bool first_only)
{
int myrank = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
if (myrank == 0)
{
@@ -5694,6 +5932,9 @@ void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.max_reqs = 2 * cpusize;
cache.reqs = new MPI_Request[cache.max_reqs];
cache.stats = new MPI_Status[cache.max_reqs];
cache.tc_req_node = new int[cache.max_reqs];
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
}
MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
@@ -5740,6 +5981,9 @@ void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.max_reqs = 2 * cpusize;
cache.reqs = new MPI_Request[cache.max_reqs];
cache.stats = new MPI_Status[cache.max_reqs];
cache.tc_req_node = new int[cache.max_reqs];
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
}
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
@@ -5786,6 +6030,9 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.max_reqs = 2 * cpusize;
cache.reqs = new MPI_Request[cache.max_reqs];
cache.stats = new MPI_Status[cache.max_reqs];
cache.tc_req_node = new int[cache.max_reqs];
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
}
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
@@ -5807,58 +6054,98 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
int cpusize = cache.cpusize;
int req_no = 0;
int pending_recv = 0;
int *req_node = new int[cache.max_reqs];
int *req_is_recv = new int[cache.max_reqs];
int *completed = new int[cache.max_reqs];
// Post receives first so peers can progress rendezvous early.
for (int node = 0; node < cpusize; node++)
{
if (node == myrank)
if (node == myrank) continue;
int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[node] = rlength;
if (rlength > 0)
{
int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[node] = length;
if (length > 0)
if (rlength > cache.recv_buf_caps[node])
{
if (length > cache.recv_buf_caps[node])
{
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
cache.recv_bufs[node] = new double[length];
cache.recv_buf_caps[node] = length;
}
data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
cache.recv_bufs[node] = new double[rlength];
cache.recv_buf_caps[node] = rlength;
}
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
pending_recv++;
}
else
}
// Local transfer on this rank.
int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[myrank] = self_len;
if (self_len > 0)
{
if (self_len > cache.recv_buf_caps[myrank])
{
int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cache.send_lengths[node] = slength;
if (slength > 0)
if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank];
cache.recv_bufs[myrank] = new double[self_len];
cache.recv_buf_caps[myrank] = self_len;
}
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
// Pack and post sends.
for (int node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cache.send_lengths[node] = slength;
if (slength > 0)
{
if (slength > cache.send_buf_caps[node])
{
if (slength > cache.send_buf_caps[node])
{
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
cache.send_bufs[node] = new double[slength];
cache.send_buf_caps[node] = slength;
}
data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
cache.send_bufs[node] = new double[slength];
cache.send_buf_caps[node] = slength;
}
int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[node] = rlength;
if (rlength > 0)
data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 0;
req_no++;
}
}
// Unpack as soon as receive completes to reduce pure wait time.
while (pending_recv > 0)
{
int outcount = 0;
MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
{
int idx = completed[i];
if (idx >= 0 && req_is_recv[idx])
{
if (rlength > cache.recv_buf_caps[node])
{
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
cache.recv_bufs[node] = new double[rlength];
cache.recv_buf_caps[node] = rlength;
}
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
int recv_node_i = req_node[idx];
data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
pending_recv--;
}
}
}
MPI_Waitall(req_no, cache.reqs, cache.stats);
if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
for (int node = 0; node < cpusize; node++)
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (self_len > 0)
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
delete[] req_node;
delete[] req_is_recv;
delete[] completed;
}
// collect all buffer grid segments or blocks for given patch

View File

@@ -108,6 +108,9 @@ namespace Parallel
MPI_Status *stats;
int max_reqs;
bool lengths_valid;
int *tc_req_node;
int *tc_req_is_recv;
int *tc_completed;
SyncCache();
void invalidate();
void destroy();
@@ -121,7 +124,10 @@ namespace Parallel
struct AsyncSyncState {
int req_no;
bool active;
AsyncSyncState() : req_no(0), active(false) {}
int *req_node;
int *req_is_recv;
int pending_recv;
AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
};
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
@@ -173,12 +179,13 @@ namespace Parallel
MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only);
MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue
MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat);
void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
double L2Norm(Patch *Pat, var *vf);
void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
void checkvarl(MyList<var> *pp, bool first_only);
void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
double L2Norm(Patch *Pat, var *vf);
void L2Norm7(Patch *Pat, var **vf, double *norms);
void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
void checkvarl(MyList<var> *pp, bool first_only);
MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat);
void prepare_inter_time_level(Patch *Pat,
@@ -210,11 +217,12 @@ namespace Parallel
void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape);
bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl);
void checkpatchlist(MyList<Patch> *PatL, bool buflog);
double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here);
double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here);
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0);

View File

@@ -3439,10 +3439,10 @@ void ShellPatch::write_Pablo_file_ss(int *ext, double xmin, double xmax, double
delete[] Z;
}
double ShellPatch::L2Norm(var *vf)
{
double tvf, dtvf = 0;
int BDW = overghost;
double ShellPatch::L2Norm(var *vf)
{
double tvf, dtvf = 0;
int BDW = overghost;
MyList<ss_patch> *sPp = PatL;
while (sPp)
@@ -3469,13 +3469,50 @@ double ShellPatch::L2Norm(var *vf)
MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
tvf = sqrt(tvf);
return tvf;
}
// find maximum of abstract value, XX store position for maximum, Shellf store maximum themselvs
void ShellPatch::Find_Maximum(MyList<var> *VarList, double *XX,
double *Shellf)
return tvf;
}
void ShellPatch::L2Norm7(var **vf, double *norms)
{
double tvf[7], dtvf[7];
int BDW = overghost;
for (int i = 0; i < 7; i++)
dtvf[i] = 0;
MyList<ss_patch> *sPp = PatL;
while (sPp)
{
MyList<Block> *Bp = sPp->data->blb;
while (Bp)
{
Block *cg = Bp->data;
if (myrank == cg->rank)
{
f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
sPp->data->bbox[0], sPp->data->bbox[1], sPp->data->bbox[2],
sPp->data->bbox[3], sPp->data->bbox[4], sPp->data->bbox[5],
cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
cg->fgfs[vf[6]->sgfn], tvf, BDW);
for (int i = 0; i < 7; i++)
dtvf[i] += tvf[i];
}
if (Bp == sPp->data->ble)
break;
Bp = Bp->next;
}
sPp = sPp->next;
}
MPI_Allreduce(dtvf, tvf, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
for (int i = 0; i < 7; i++)
norms[i] = sqrt(tvf[i]);
}
// find maximum of abstract value, XX store position for maximum, Shellf store maximum themselvs
void ShellPatch::Find_Maximum(MyList<var> *VarList, double *XX,
double *Shellf)
{
MyList<var> *varl;
int num_var = 0;

View File

@@ -195,10 +195,11 @@ public:
bool Interp_One_Point(MyList<var> *VarList,
double *XX, /*input global Cartesian coordinate*/
double *Shellf, int Symmetry);
void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
char *filename, int sst);
double L2Norm(var *vf);
void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf);
};
void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
char *filename, int sst);
double L2Norm(var *vf);
void L2Norm7(var **vf, double *norms);
void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf);
};
#endif /* SHELLPATCH_H */

File diff suppressed because it is too large Load Diff

View File

@@ -45,10 +45,11 @@ public:
int checkrun;
char checkfilename[50];
int Steps;
double StartTime, TotalTime;
double AnasTime, DumpTime, d2DumpTime, CheckTime;
double LastAnas, LastConsOut;
double Courant;
double StartTime, TotalTime;
double AnasTime, DumpTime, d2DumpTime, CheckTime;
double LastAnas, LastConsOut;
int *ConstraintRefreshLevels;
double Courant;
double numepss, numepsb, numepsh;
int Symmetry;
int maxl, decn;
@@ -130,10 +131,12 @@ public:
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor;
surface_integral *Waveshell;
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor, *TimingMonitor;
surface_integral *Waveshell;
checkpoint *CheckPoint;
public:

View File

@@ -59,9 +59,10 @@
real*8, dimension(ex(1),ex(2),ex(3)),intent(out) :: Rxx,Rxy,Rxz,Ryy,Ryz,Rzz
real*8,intent(in) :: eps
real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res
real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
! gont = 0: success; gont = 1: something wrong
integer::gont
real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
! gont = 0: success; gont = 1: something wrong
integer::gont
integer :: i,j,k
!~~~~~~> Other variables:
@@ -83,11 +84,18 @@
real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz
real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz
real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
real*8 :: dX, dY, dZ, PI
real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
real*8 :: dX, dY, dZ, PI
real*8 :: divb_loc,det_loc
real*8 :: gupxx_loc,gupxy_loc,gupxz_loc,gupyy_loc,gupyz_loc,gupzz_loc
real*8 :: Rxx_loc,Rxy_loc,Rxz_loc,Ryy_loc,Ryz_loc,Rzz_loc
real*8 :: fxx_loc,fxy_loc,fxz_loc
real*8 :: Gamxa_loc,Gamya_loc,Gamza_loc
real*8 :: f_loc,chin_loc
real*8 :: l_fxx,l_fxy,l_fxz,l_fyy,l_fyz,l_fzz,S_loc
real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
double precision,parameter::FF = 0.75d0,eta=2.d0
real*8, parameter :: F1o3 = 1.D0/3.D0, F2o3 = 2.D0/3.D0,F3o2=1.5d0, F1o6 = 1.D0/6.D0
real*8, parameter :: F16=1.6d1,F8=8.d0
@@ -96,11 +104,11 @@
real*8, dimension(ex(1),ex(2),ex(3)) :: reta
#endif
#if (GAUGE == 6 || GAUGE == 7)
integer :: BHN,i,j,k
real*8, dimension(9) :: Porg
real*8, dimension(3) :: Mass
real*8 :: r1,r2,M,A,w1,w2,C1,C2
#if (GAUGE == 6 || GAUGE == 7)
integer :: BHN
real*8, dimension(9) :: Porg
real*8, dimension(3) :: Mass
real*8 :: r1,r2,M,A,w1,w2,C1,C2
real*8, dimension(ex(1),ex(2),ex(3)) :: reta
call getpbh(BHN,Porg,Mass)
@@ -145,174 +153,204 @@
dY = Y(2) - Y(1)
dZ = Z(2) - Z(1)
alpn1 = Lap + ONE
chin1 = chi + ONE
gxx = dxx + ONE
gyy = dyy + ONE
gzz = dzz + ONE
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
alpn1(i,j,k) = Lap(i,j,k) + ONE
chin1(i,j,k) = chi(i,j,k) + ONE
gxx(i,j,k) = dxx(i,j,k) + ONE
gyy(i,j,k) = dyy(i,j,k) + ONE
gzz(i,j,k) = dzz(i,j,k) + ONE
enddo
enddo
enddo
call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev)
call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev)
call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev)
div_beta = betaxx + betayy + betazz
call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + &
TWO *( gxx * betaxx + gxy * betayx + gxz * betazx)
gyy_rhs = - TWO * alpn1 * Ayy - F2o3 * gyy * div_beta + &
TWO *( gxy * betaxy + gyy * betayy + gyz * betazy)
gzz_rhs = - TWO * alpn1 * Azz - F2o3 * gzz * div_beta + &
TWO *( gxz * betaxz + gyz * betayz + gzz * betazz)
gxy_rhs = - TWO * alpn1 * Axy + F1o3 * gxy * div_beta + &
gxx * betaxy + gxz * betazy + &
gyy * betayx + gyz * betazx &
- gxy * betazz
gyz_rhs = - TWO * alpn1 * Ayz + F1o3 * gyz * div_beta + &
gxy * betaxz + gyy * betayz + &
gxz * betaxy + gzz * betazy &
- gyz * betaxx
gxz_rhs = - TWO * alpn1 * Axz + F1o3 * gxz * div_beta + &
gxx * betaxz + gxy * betayz + &
gyz * betayx + gzz * betazx &
- gxz * betayy !rhs for gij
! invert tilted metric
gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
gupxx = ( gyy * gzz - gyz * gyz ) / gupzz
gupxy = - ( gxy * gzz - gyz * gxz ) / gupzz
gupxz = ( gxy * gyz - gyy * gxz ) / gupzz
gupyy = ( gxx * gzz - gxz * gxz ) / gupzz
gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz
gupzz = ( gxx * gyy - gxy * gxy ) / gupzz
if(co == 0)then
! Gam^i_Res = Gam^i + gup^ij_,j
Gmx_Res = Gamx - (gupxx*(gupxx*gxxx+gupxy*gxyx+gupxz*gxzx)&
+gupxy*(gupxx*gxyx+gupxy*gyyx+gupxz*gyzx)&
+gupxz*(gupxx*gxzx+gupxy*gyzx+gupxz*gzzx)&
+gupxx*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+gupxy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+gupxz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+gupxx*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupxy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+gupxz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
Gmy_Res = Gamy - (gupxx*(gupxy*gxxx+gupyy*gxyx+gupyz*gxzx)&
+gupxy*(gupxy*gxyx+gupyy*gyyx+gupyz*gyzx)&
+gupxz*(gupxy*gxzx+gupyy*gyzx+gupyz*gzzx)&
+gupxy*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+gupyy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+gupyz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+gupxy*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupyy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+gupyz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
Gmz_Res = Gamz - (gupxx*(gupxz*gxxx+gupyz*gxyx+gupzz*gxzx)&
+gupxy*(gupxz*gxyx+gupyz*gyyx+gupzz*gyzx)&
+gupxz*(gupxz*gxzx+gupyz*gyzx+gupzz*gzzx)&
+gupxy*(gupxz*gxxy+gupyz*gxyy+gupzz*gxzy)&
+gupyy*(gupxz*gxyy+gupyz*gyyy+gupzz*gyzy)&
+gupyz*(gupxz*gxzy+gupyz*gyzy+gupzz*gzzy)&
+gupxz*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupyz*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+gupzz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
endif
! second kind of connection
Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz ))
Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz ))
Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz ))
Gamxyy =HALF*( gupxx*(TWO*gxyy - gyyx ) + gupxy*gyyy + gupxz*(TWO*gyzy - gyyz ))
Gamyyy =HALF*( gupxy*(TWO*gxyy - gyyx ) + gupyy*gyyy + gupyz*(TWO*gyzy - gyyz ))
Gamzyy =HALF*( gupxz*(TWO*gxyy - gyyx ) + gupyz*gyyy + gupzz*(TWO*gyzy - gyyz ))
Gamxzz =HALF*( gupxx*(TWO*gxzz - gzzx ) + gupxy*(TWO*gyzz - gzzy ) + gupxz*gzzz)
Gamyzz =HALF*( gupxy*(TWO*gxzz - gzzx ) + gupyy*(TWO*gyzz - gzzy ) + gupyz*gzzz)
Gamzzz =HALF*( gupxz*(TWO*gxzz - gzzx ) + gupyz*(TWO*gyzz - gzzy ) + gupzz*gzzz)
Gamxxy =HALF*( gupxx*gxxy + gupxy*gyyx + gupxz*( gxzy + gyzx - gxyz ) )
Gamyxy =HALF*( gupxy*gxxy + gupyy*gyyx + gupyz*( gxzy + gyzx - gxyz ) )
Gamzxy =HALF*( gupxz*gxxy + gupyz*gyyx + gupzz*( gxzy + gyzx - gxyz ) )
Gamxxz =HALF*( gupxx*gxxz + gupxy*( gxyz + gyzx - gxzy ) + gupxz*gzzx )
Gamyxz =HALF*( gupxy*gxxz + gupyy*( gxyz + gyzx - gxzy ) + gupyz*gzzx )
Gamzxz =HALF*( gupxz*gxxz + gupyz*( gxyz + gyzx - gxzy ) + gupzz*gzzx )
Gamxyz =HALF*( gupxx*( gxyz + gxzy - gyzx ) + gupxy*gyyz + gupxz*gzzy )
Gamyyz =HALF*( gupxy*( gxyz + gxzy - gyzx ) + gupyy*gyyz + gupyz*gzzy )
Gamzyz =HALF*( gupxz*( gxyz + gxzy - gyzx ) + gupyz*gyyz + gupzz*gzzy )
! Raise indices of \tilde A_{ij} and store in R_ij
Rxx = gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + &
TWO*(gupxx * gupxy * Axy + gupxx * gupxz * Axz + gupxy * gupxz * Ayz)
Ryy = gupxy * gupxy * Axx + gupyy * gupyy * Ayy + gupyz * gupyz * Azz + &
TWO*(gupxy * gupyy * Axy + gupxy * gupyz * Axz + gupyy * gupyz * Ayz)
Rzz = gupxz * gupxz * Axx + gupyz * gupyz * Ayy + gupzz * gupzz * Azz + &
TWO*(gupxz * gupyz * Axy + gupxz * gupzz * Axz + gupyz * gupzz * Ayz)
Rxy = gupxx * gupxy * Axx + gupxy * gupyy * Ayy + gupxz * gupyz * Azz + &
(gupxx * gupyy + gupxy * gupxy)* Axy + &
(gupxx * gupyz + gupxz * gupxy)* Axz + &
(gupxy * gupyz + gupxz * gupyy)* Ayz
Rxz = gupxx * gupxz * Axx + gupxy * gupyz * Ayy + gupxz * gupzz * Azz + &
(gupxx * gupyz + gupxy * gupxz)* Axy + &
(gupxx * gupzz + gupxz * gupxz)* Axz + &
(gupxy * gupzz + gupxz * gupyz)* Ayz
Ryz = gupxy * gupxz * Axx + gupyy * gupyz * Ayy + gupyz * gupzz * Azz + &
(gupxy * gupyz + gupyy * gupxz)* Axy + &
(gupxy * gupzz + gupyz * gupxz)* Axz + &
(gupyy * gupzz + gupyz * gupyz)* Ayz
! Right hand side for Gam^i without shift terms...
call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
Gamx_rhs = - TWO * ( Lapx * Rxx + Lapy * Rxy + Lapz * Rxz ) + &
TWO * alpn1 * ( &
-F3o2/chin1 * ( chix * Rxx + chiy * Rxy + chiz * Rxz ) - &
gupxx * ( F2o3 * Kx + EIGHT * PI * Sx ) - &
gupxy * ( F2o3 * Ky + EIGHT * PI * Sy ) - &
gupxz * ( F2o3 * Kz + EIGHT * PI * Sz ) + &
Gamxxx * Rxx + Gamxyy * Ryy + Gamxzz * Rzz + &
TWO * ( Gamxxy * Rxy + Gamxxz * Rxz + Gamxyz * Ryz ) )
Gamy_rhs = - TWO * ( Lapx * Rxy + Lapy * Ryy + Lapz * Ryz ) + &
TWO * alpn1 * ( &
-F3o2/chin1 * ( chix * Rxy + chiy * Ryy + chiz * Ryz ) - &
gupxy * ( F2o3 * Kx + EIGHT * PI * Sx ) - &
gupyy * ( F2o3 * Ky + EIGHT * PI * Sy ) - &
gupyz * ( F2o3 * Kz + EIGHT * PI * Sz ) + &
Gamyxx * Rxx + Gamyyy * Ryy + Gamyzz * Rzz + &
TWO * ( Gamyxy * Rxy + Gamyxz * Rxz + Gamyyz * Ryz ) )
Gamz_rhs = - TWO * ( Lapx * Rxz + Lapy * Ryz + Lapz * Rzz ) + &
TWO * alpn1 * ( &
-F3o2/chin1 * ( chix * Rxz + chiy * Ryz + chiz * Rzz ) - &
gupxz * ( F2o3 * Kx + EIGHT * PI * Sx ) - &
gupyz * ( F2o3 * Ky + EIGHT * PI * Sy ) - &
gupzz * ( F2o3 * Kz + EIGHT * PI * Sz ) + &
Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz + &
TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) )
call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
divb_loc = betaxx(i,j,k) + betayy(i,j,k) + betazz(i,j,k)
div_beta(i,j,k) = divb_loc
chi_rhs(i,j,k) = F2o3 * chin1(i,j,k) * (alpn1(i,j,k) * trK(i,j,k) - divb_loc)
gxx_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axx(i,j,k) - F2o3 * gxx(i,j,k) * divb_loc + &
TWO * ( gxx(i,j,k) * betaxx(i,j,k) + gxy(i,j,k) * betayx(i,j,k) + gxz(i,j,k) * betazx(i,j,k) )
gyy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayy(i,j,k) - F2o3 * gyy(i,j,k) * divb_loc + &
TWO * ( gxy(i,j,k) * betaxy(i,j,k) + gyy(i,j,k) * betayy(i,j,k) + gyz(i,j,k) * betazy(i,j,k) )
gzz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Azz(i,j,k) - F2o3 * gzz(i,j,k) * divb_loc + &
TWO * ( gxz(i,j,k) * betaxz(i,j,k) + gyz(i,j,k) * betayz(i,j,k) + gzz(i,j,k) * betazz(i,j,k) )
gxy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axy(i,j,k) + F1o3 * gxy(i,j,k) * divb_loc + &
gxx(i,j,k) * betaxy(i,j,k) + gxz(i,j,k) * betazy(i,j,k) + gyy(i,j,k) * betayx(i,j,k) + &
gyz(i,j,k) * betazx(i,j,k) - gxy(i,j,k) * betazz(i,j,k)
gyz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayz(i,j,k) + F1o3 * gyz(i,j,k) * divb_loc + &
gxy(i,j,k) * betaxz(i,j,k) + gyy(i,j,k) * betayz(i,j,k) + gxz(i,j,k) * betaxy(i,j,k) + &
gzz(i,j,k) * betazy(i,j,k) - gyz(i,j,k) * betaxx(i,j,k)
gxz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axz(i,j,k) + F1o3 * gxz(i,j,k) * divb_loc + &
gxx(i,j,k) * betaxz(i,j,k) + gxy(i,j,k) * betayz(i,j,k) + gyz(i,j,k) * betayx(i,j,k) + &
gzz(i,j,k) * betazx(i,j,k) - gxz(i,j,k) * betayy(i,j,k)
det_loc = gxx(i,j,k) * gyy(i,j,k) * gzz(i,j,k) + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) + &
gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) - gxz(i,j,k) * gyy(i,j,k) * gxz(i,j,k) - &
gxy(i,j,k) * gxy(i,j,k) * gzz(i,j,k) - gxx(i,j,k) * gyz(i,j,k) * gyz(i,j,k)
gupxx_loc = ( gyy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gyz(i,j,k) ) / det_loc
gupxy_loc = - ( gxy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gxz(i,j,k) ) / det_loc
gupxz_loc = ( gxy(i,j,k) * gyz(i,j,k) - gyy(i,j,k) * gxz(i,j,k) ) / det_loc
gupyy_loc = ( gxx(i,j,k) * gzz(i,j,k) - gxz(i,j,k) * gxz(i,j,k) ) / det_loc
gupyz_loc = - ( gxx(i,j,k) * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / det_loc
gupzz_loc = ( gxx(i,j,k) * gyy(i,j,k) - gxy(i,j,k) * gxy(i,j,k) ) / det_loc
gupxx(i,j,k) = gupxx_loc
gupxy(i,j,k) = gupxy_loc
gupxz(i,j,k) = gupxz_loc
gupyy(i,j,k) = gupyy_loc
gupyz(i,j,k) = gupyz_loc
gupzz(i,j,k) = gupzz_loc
if(co == 0)then
Gmx_Res(i,j,k) = Gamx(i,j,k) - ( &
gupxx_loc*(gupxx_loc*gxxx(i,j,k)+gupxy_loc*gxyx(i,j,k)+gupxz_loc*gxzx(i,j,k)) + &
gupxy_loc*(gupxx_loc*gxyx(i,j,k)+gupxy_loc*gyyx(i,j,k)+gupxz_loc*gyzx(i,j,k)) + &
gupxz_loc*(gupxx_loc*gxzx(i,j,k)+gupxy_loc*gyzx(i,j,k)+gupxz_loc*gzzx(i,j,k)) + &
gupxx_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
gupxy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
gupxz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
gupxx_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
gupxy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
gupxz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
Gmy_Res(i,j,k) = Gamy(i,j,k) - ( &
gupxx_loc*(gupxy_loc*gxxx(i,j,k)+gupyy_loc*gxyx(i,j,k)+gupyz_loc*gxzx(i,j,k)) + &
gupxy_loc*(gupxy_loc*gxyx(i,j,k)+gupyy_loc*gyyx(i,j,k)+gupyz_loc*gyzx(i,j,k)) + &
gupxz_loc*(gupxy_loc*gxzx(i,j,k)+gupyy_loc*gyzx(i,j,k)+gupyz_loc*gzzx(i,j,k)) + &
gupxy_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
gupyy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
gupyz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
gupxy_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
gupyy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
gupyz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
Gmz_Res(i,j,k) = Gamz(i,j,k) - ( &
gupxx_loc*(gupxz_loc*gxxx(i,j,k)+gupyz_loc*gxyx(i,j,k)+gupzz_loc*gxzx(i,j,k)) + &
gupxy_loc*(gupxz_loc*gxyx(i,j,k)+gupyz_loc*gyyx(i,j,k)+gupzz_loc*gyzx(i,j,k)) + &
gupxz_loc*(gupxz_loc*gxzx(i,j,k)+gupyz_loc*gyzx(i,j,k)+gupzz_loc*gzzx(i,j,k)) + &
gupxy_loc*(gupxz_loc*gxxy(i,j,k)+gupyz_loc*gxyy(i,j,k)+gupzz_loc*gxzy(i,j,k)) + &
gupyy_loc*(gupxz_loc*gxyy(i,j,k)+gupyz_loc*gyyy(i,j,k)+gupzz_loc*gyzy(i,j,k)) + &
gupyz_loc*(gupxz_loc*gxzy(i,j,k)+gupyz_loc*gyzy(i,j,k)+gupzz_loc*gzzy(i,j,k)) + &
gupxz_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
gupyz_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
gupzz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
endif
Gamxxx(i,j,k)=HALF*( gupxx_loc*gxxx(i,j,k) + gupxy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupxz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
Gamyxx(i,j,k)=HALF*( gupxy_loc*gxxx(i,j,k) + gupyy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupyz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
Gamzxx(i,j,k)=HALF*( gupxz_loc*gxxx(i,j,k) + gupyz_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupzz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
Gamxyy(i,j,k)=HALF*( gupxx_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupxy_loc*gyyy(i,j,k) + gupxz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
Gamyyy(i,j,k)=HALF*( gupxy_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyy_loc*gyyy(i,j,k) + gupyz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
Gamzyy(i,j,k)=HALF*( gupxz_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyz_loc*gyyy(i,j,k) + gupzz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
Gamxzz(i,j,k)=HALF*( gupxx_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupxy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupxz_loc*gzzz(i,j,k))
Gamyzz(i,j,k)=HALF*( gupxy_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupyz_loc*gzzz(i,j,k))
Gamzzz(i,j,k)=HALF*( gupxz_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyz_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupzz_loc*gzzz(i,j,k))
Gamxxy(i,j,k)=HALF*( gupxx_loc*gxxy(i,j,k) + gupxy_loc*gyyx(i,j,k) + gupxz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
Gamyxy(i,j,k)=HALF*( gupxy_loc*gxxy(i,j,k) + gupyy_loc*gyyx(i,j,k) + gupyz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
Gamzxy(i,j,k)=HALF*( gupxz_loc*gxxy(i,j,k) + gupyz_loc*gyyx(i,j,k) + gupzz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
Gamxxz(i,j,k)=HALF*( gupxx_loc*gxxz(i,j,k) + gupxy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupxz_loc*gzzx(i,j,k) )
Gamyxz(i,j,k)=HALF*( gupxy_loc*gxxz(i,j,k) + gupyy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupyz_loc*gzzx(i,j,k) )
Gamzxz(i,j,k)=HALF*( gupxz_loc*gxxz(i,j,k) + gupyz_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupzz_loc*gzzx(i,j,k) )
Gamxyz(i,j,k)=HALF*( gupxx_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupxy_loc*gyyz(i,j,k) + gupxz_loc*gzzy(i,j,k) )
Gamyyz(i,j,k)=HALF*( gupxy_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyy_loc*gyyz(i,j,k) + gupyz_loc*gzzy(i,j,k) )
Gamzyz(i,j,k)=HALF*( gupxz_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyz_loc*gyyz(i,j,k) + gupzz_loc*gzzy(i,j,k) )
enddo
enddo
enddo
! Raise indices of \tilde A_{ij} and store in R_ij
! Right hand side for Gam^i without shift terms...
call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
gupxx_loc = gupxx(i,j,k)
gupxy_loc = gupxy(i,j,k)
gupxz_loc = gupxz(i,j,k)
gupyy_loc = gupyy(i,j,k)
gupyz_loc = gupyz(i,j,k)
gupzz_loc = gupzz(i,j,k)
Rxx_loc = gupxx_loc * gupxx_loc * Axx(i,j,k) + gupxy_loc * gupxy_loc * Ayy(i,j,k) + gupxz_loc * gupxz_loc * Azz(i,j,k) + &
TWO * (gupxx_loc * gupxy_loc * Axy(i,j,k) + gupxx_loc * gupxz_loc * Axz(i,j,k) + gupxy_loc * gupxz_loc * Ayz(i,j,k))
Ryy_loc = gupxy_loc * gupxy_loc * Axx(i,j,k) + gupyy_loc * gupyy_loc * Ayy(i,j,k) + gupyz_loc * gupyz_loc * Azz(i,j,k) + &
TWO * (gupxy_loc * gupyy_loc * Axy(i,j,k) + gupxy_loc * gupyz_loc * Axz(i,j,k) + gupyy_loc * gupyz_loc * Ayz(i,j,k))
Rzz_loc = gupxz_loc * gupxz_loc * Axx(i,j,k) + gupyz_loc * gupyz_loc * Ayy(i,j,k) + gupzz_loc * gupzz_loc * Azz(i,j,k) + &
TWO * (gupxz_loc * gupyz_loc * Axy(i,j,k) + gupxz_loc * gupzz_loc * Axz(i,j,k) + gupyz_loc * gupzz_loc * Ayz(i,j,k))
Rxy_loc = gupxx_loc * gupxy_loc * Axx(i,j,k) + gupxy_loc * gupyy_loc * Ayy(i,j,k) + gupxz_loc * gupyz_loc * Azz(i,j,k) + &
(gupxx_loc * gupyy_loc + gupxy_loc * gupxy_loc) * Axy(i,j,k) + &
(gupxx_loc * gupyz_loc + gupxz_loc * gupxy_loc) * Axz(i,j,k) + &
(gupxy_loc * gupyz_loc + gupxz_loc * gupyy_loc) * Ayz(i,j,k)
Rxz_loc = gupxx_loc * gupxz_loc * Axx(i,j,k) + gupxy_loc * gupyz_loc * Ayy(i,j,k) + gupxz_loc * gupzz_loc * Azz(i,j,k) + &
(gupxx_loc * gupyz_loc + gupxy_loc * gupxz_loc) * Axy(i,j,k) + &
(gupxx_loc * gupzz_loc + gupxz_loc * gupxz_loc) * Axz(i,j,k) + &
(gupxy_loc * gupzz_loc + gupxz_loc * gupyz_loc) * Ayz(i,j,k)
Ryz_loc = gupxy_loc * gupxz_loc * Axx(i,j,k) + gupyy_loc * gupyz_loc * Ayy(i,j,k) + gupyz_loc * gupzz_loc * Azz(i,j,k) + &
(gupxy_loc * gupyz_loc + gupyy_loc * gupxz_loc) * Axy(i,j,k) + &
(gupxy_loc * gupzz_loc + gupyz_loc * gupxz_loc) * Axz(i,j,k) + &
(gupyy_loc * gupzz_loc + gupyz_loc * gupyz_loc) * Ayz(i,j,k)
Rxx(i,j,k) = Rxx_loc
Ryy(i,j,k) = Ryy_loc
Rzz(i,j,k) = Rzz_loc
Rxy(i,j,k) = Rxy_loc
Rxz(i,j,k) = Rxz_loc
Ryz(i,j,k) = Ryz_loc
Gamx_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxx_loc + Lapy(i,j,k) * Rxy_loc + Lapz(i,j,k) * Rxz_loc) + &
TWO * alpn1(i,j,k) * ( &
-F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxx_loc + chiy(i,j,k) * Rxy_loc + chiz(i,j,k) * Rxz_loc) - &
gupxx_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupxy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupxz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamxxx(i,j,k) * Rxx_loc + Gamxyy(i,j,k) * Ryy_loc + Gamxzz(i,j,k) * Rzz_loc + &
TWO * (Gamxxy(i,j,k) * Rxy_loc + Gamxxz(i,j,k) * Rxz_loc + Gamxyz(i,j,k) * Ryz_loc))
Gamy_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxy_loc + Lapy(i,j,k) * Ryy_loc + Lapz(i,j,k) * Ryz_loc) + &
TWO * alpn1(i,j,k) * ( &
-F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxy_loc + chiy(i,j,k) * Ryy_loc + chiz(i,j,k) * Ryz_loc) - &
gupxy_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupyy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupyz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamyxx(i,j,k) * Rxx_loc + Gamyyy(i,j,k) * Ryy_loc + Gamyzz(i,j,k) * Rzz_loc + &
TWO * (Gamyxy(i,j,k) * Rxy_loc + Gamyxz(i,j,k) * Rxz_loc + Gamyyz(i,j,k) * Ryz_loc))
Gamz_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxz_loc + Lapy(i,j,k) * Ryz_loc + Lapz(i,j,k) * Rzz_loc) + &
TWO * alpn1(i,j,k) * ( &
-F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxz_loc + chiy(i,j,k) * Ryz_loc + chiz(i,j,k) * Rzz_loc) - &
gupxz_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupyz_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupzz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamzxx(i,j,k) * Rxx_loc + Gamzyy(i,j,k) * Ryy_loc + Gamzzz(i,j,k) * Rzz_loc + &
TWO * (Gamzxy(i,j,k) * Rxy_loc + Gamzxz(i,j,k) * Rxz_loc + Gamzyz(i,j,k) * Ryz_loc))
enddo
enddo
enddo
call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,&
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev)
@@ -321,38 +359,54 @@
call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,&
X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev)
fxx = gxxx + gxyy + gxzz
fxy = gxyx + gyyy + gyzz
fxz = gxzx + gyzy + gzzz
Gamxa = gupxx * Gamxxx + gupyy * Gamxyy + gupzz * Gamxzz + &
TWO*( gupxy * Gamxxy + gupxz * Gamxxz + gupyz * Gamxyz )
Gamya = gupxx * Gamyxx + gupyy * Gamyyy + gupzz * Gamyzz + &
TWO*( gupxy * Gamyxy + gupxz * Gamyxz + gupyz * Gamyyz )
Gamza = gupxx * Gamzxx + gupyy * Gamzyy + gupzz * Gamzzz + &
TWO*( gupxy * Gamzxy + gupxz * Gamzxz + gupyz * Gamzyz )
call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
Gamx_rhs = Gamx_rhs + F2o3 * Gamxa * div_beta - &
Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz + &
F1o3 * (gupxx * fxx + gupxy * fxy + gupxz * fxz ) + &
gupxx * gxxx + gupyy * gyyx + gupzz * gzzx + &
TWO * (gupxy * gxyx + gupxz * gxzx + gupyz * gyzx )
Gamy_rhs = Gamy_rhs + F2o3 * Gamya * div_beta - &
Gamxa * betayx - Gamya * betayy - Gamza * betayz + &
F1o3 * (gupxy * fxx + gupyy * fxy + gupyz * fxz ) + &
gupxx * gxxy + gupyy * gyyy + gupzz * gzzy + &
TWO * (gupxy * gxyy + gupxz * gxzy + gupyz * gyzy )
Gamz_rhs = Gamz_rhs + F2o3 * Gamza * div_beta - &
Gamxa * betazx - Gamya * betazy - Gamza * betazz + &
F1o3 * (gupxz * fxx + gupyz * fxy + gupzz * fxz ) + &
gupxx * gxxz + gupyy * gyyz + gupzz * gzzz + &
TWO * (gupxy * gxyz + gupxz * gxzz + gupyz * gyzz ) !rhs for Gam^i
call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
divb_loc = div_beta(i,j,k)
fxx_loc = gxxx(i,j,k) + gxyy(i,j,k) + gxzz(i,j,k)
fxy_loc = gxyx(i,j,k) + gyyy(i,j,k) + gyzz(i,j,k)
fxz_loc = gxzx(i,j,k) + gyzy(i,j,k) + gzzz(i,j,k)
gupxx_loc = gupxx(i,j,k)
gupxy_loc = gupxy(i,j,k)
gupxz_loc = gupxz(i,j,k)
gupyy_loc = gupyy(i,j,k)
gupyz_loc = gupyz(i,j,k)
gupzz_loc = gupzz(i,j,k)
Gamxa_loc = gupxx_loc * Gamxxx(i,j,k) + gupyy_loc * Gamxyy(i,j,k) + gupzz_loc * Gamxzz(i,j,k) + &
TWO * (gupxy_loc * Gamxxy(i,j,k) + gupxz_loc * Gamxxz(i,j,k) + gupyz_loc * Gamxyz(i,j,k))
Gamya_loc = gupxx_loc * Gamyxx(i,j,k) + gupyy_loc * Gamyyy(i,j,k) + gupzz_loc * Gamyzz(i,j,k) + &
TWO * (gupxy_loc * Gamyxy(i,j,k) + gupxz_loc * Gamyxz(i,j,k) + gupyz_loc * Gamyyz(i,j,k))
Gamza_loc = gupxx_loc * Gamzxx(i,j,k) + gupyy_loc * Gamzyy(i,j,k) + gupzz_loc * Gamzzz(i,j,k) + &
TWO * (gupxy_loc * Gamzxy(i,j,k) + gupxz_loc * Gamzxz(i,j,k) + gupyz_loc * Gamzyz(i,j,k))
Gamxa(i,j,k) = Gamxa_loc
Gamya(i,j,k) = Gamya_loc
Gamza(i,j,k) = Gamza_loc
Gamx_rhs(i,j,k) = Gamx_rhs(i,j,k) + F2o3 * Gamxa_loc * divb_loc - &
Gamxa_loc * betaxx(i,j,k) - Gamya_loc * betaxy(i,j,k) - Gamza_loc * betaxz(i,j,k) + &
F1o3 * (gupxx_loc * fxx_loc + gupxy_loc * fxy_loc + gupxz_loc * fxz_loc) + &
gupxx_loc * gxxx(i,j,k) + gupyy_loc * gyyx(i,j,k) + gupzz_loc * gzzx(i,j,k) + &
TWO * (gupxy_loc * gxyx(i,j,k) + gupxz_loc * gxzx(i,j,k) + gupyz_loc * gyzx(i,j,k))
Gamy_rhs(i,j,k) = Gamy_rhs(i,j,k) + F2o3 * Gamya_loc * divb_loc - &
Gamxa_loc * betayx(i,j,k) - Gamya_loc * betayy(i,j,k) - Gamza_loc * betayz(i,j,k) + &
F1o3 * (gupxy_loc * fxx_loc + gupyy_loc * fxy_loc + gupyz_loc * fxz_loc) + &
gupxx_loc * gxxy(i,j,k) + gupyy_loc * gyyy(i,j,k) + gupzz_loc * gzzy(i,j,k) + &
TWO * (gupxy_loc * gxyy(i,j,k) + gupxz_loc * gxzy(i,j,k) + gupyz_loc * gyzy(i,j,k))
Gamz_rhs(i,j,k) = Gamz_rhs(i,j,k) + F2o3 * Gamza_loc * divb_loc - &
Gamxa_loc * betazx(i,j,k) - Gamya_loc * betazy(i,j,k) - Gamza_loc * betazz(i,j,k) + &
F1o3 * (gupxz_loc * fxx_loc + gupyz_loc * fxy_loc + gupzz_loc * fxz_loc) + &
gupxx_loc * gxxz(i,j,k) + gupyy_loc * gyyz(i,j,k) + gupzz_loc * gzzz(i,j,k) + &
TWO * (gupxy_loc * gxyz(i,j,k) + gupxz_loc * gxzz(i,j,k) + gupyz_loc * gyzz(i,j,k))
enddo
enddo
enddo
!first kind of connection stored in gij,k
gxxx = gxx * Gamxxx + gxy * Gamyxx + gxz * Gamzxx
@@ -601,192 +655,190 @@
Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz + &
Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy + &
Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz )
!covariant second derivative of chi respect to tilted metric
call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz
fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz
fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz
fyy = fyy - Gamxyy * chix - Gamyyy * chiy - Gamzyy * chiz
fyz = fyz - Gamxyz * chix - Gamyyz * chiy - Gamzyz * chiz
fzz = fzz - Gamxzz * chix - Gamyzz * chiy - Gamzzz * chiz
! Store D^l D_l chi - 3/(2*chi) D^l chi D_l chi in f
f = gupxx * ( fxx - F3o2/chin1 * chix * chix ) + &
gupyy * ( fyy - F3o2/chin1 * chiy * chiy ) + &
gupzz * ( fzz - F3o2/chin1 * chiz * chiz ) + &
TWO * gupxy * ( fxy - F3o2/chin1 * chix * chiy ) + &
TWO * gupxz * ( fxz - F3o2/chin1 * chix * chiz ) + &
TWO * gupyz * ( fyz - F3o2/chin1 * chiy * chiz )
! Add chi part to Ricci tensor:
Rxx = Rxx + (fxx - chix*chix/chin1/TWO + gxx * f)/chin1/TWO
Ryy = Ryy + (fyy - chiy*chiy/chin1/TWO + gyy * f)/chin1/TWO
Rzz = Rzz + (fzz - chiz*chiz/chin1/TWO + gzz * f)/chin1/TWO
Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO
Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO
Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO
! covariant second derivatives of the lapse respect to physical metric
call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
SYM,SYM,SYM,symmetry,Lev)
gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
! now get physical second kind of connection
Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF
Gamyxx = Gamyxx - ( - gxx * gxxy )*HALF
Gamzxx = Gamzxx - ( - gxx * gxxz )*HALF
Gamxyy = Gamxyy - ( - gyy * gxxx )*HALF
Gamyyy = Gamyyy - ( (chiy + chiy)/chin1 - gyy * gxxy )*HALF
Gamzyy = Gamzyy - ( - gyy * gxxz )*HALF
Gamxzz = Gamxzz - ( - gzz * gxxx )*HALF
Gamyzz = Gamyzz - ( - gzz * gxxy )*HALF
Gamzzz = Gamzzz - ( (chiz + chiz)/chin1 - gzz * gxxz )*HALF
Gamxxy = Gamxxy - ( chiy /chin1 - gxy * gxxx )*HALF
Gamyxy = Gamyxy - ( chix /chin1 - gxy * gxxy )*HALF
Gamzxy = Gamzxy - ( - gxy * gxxz )*HALF
Gamxxz = Gamxxz - ( chiz /chin1 - gxz * gxxx )*HALF
Gamyxz = Gamyxz - ( - gxz * gxxy )*HALF
Gamzxz = Gamzxz - ( chix /chin1 - gxz * gxxz )*HALF
Gamxyz = Gamxyz - ( - gyz * gxxx )*HALF
Gamyyz = Gamyyz - ( chiz /chin1 - gyz * gxxy )*HALF
Gamzyz = Gamzyz - ( chiy /chin1 - gyz * gxxz )*HALF
fxx = fxx - Gamxxx*Lapx - Gamyxx*Lapy - Gamzxx*Lapz
fyy = fyy - Gamxyy*Lapx - Gamyyy*Lapy - Gamzyy*Lapz
fzz = fzz - Gamxzz*Lapx - Gamyzz*Lapy - Gamzzz*Lapz
fxy = fxy - Gamxxy*Lapx - Gamyxy*Lapy - Gamzxy*Lapz
fxz = fxz - Gamxxz*Lapx - Gamyxz*Lapy - Gamzxz*Lapz
fyz = fyz - Gamxyz*Lapx - Gamyyz*Lapy - Gamzyz*Lapz
! store D^i D_i Lap in trK_rhs upto chi
trK_rhs = gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz )
#if 1
!! follow bam code
S = chin1 * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
f = F2o3 * trK * trK -(&
gupxx * ( &
gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz) ) + &
gupyy * ( &
gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz) ) + &
gupzz * ( &
gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz) ) + &
TWO * ( &
gupxy * ( &
gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
gupxy * (Axx * Ayy + Axy * Axy) + &
gupxz * (Axx * Ayz + Axz * Axy) + &
gupyz * (Axy * Ayz + Axz * Ayy) ) + &
gupxz * ( &
gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
gupxy * (Axx * Ayz + Axy * Axz) + &
gupxz * (Axx * Azz + Axz * Axz) + &
gupyz * (Axy * Azz + Axz * Ayz) ) + &
gupyz * ( &
gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
gupxy * (Axy * Ayz + Ayy * Axz) + &
gupxz * (Axy * Azz + Ayz * Axz) + &
gupyz * (Ayy * Azz + Ayz * Ayz) ) )) -1.6d1*PI*rho + EIGHT * PI * S
f = - F1o3 *( gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1/chin1*f)
fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz
fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
#else
! Add lapse and S_ij parts to Ricci tensor:
fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz
fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
! Compute trace-free part (note: chi^-1 and chi cancel!):
f = F1o3 *( gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) )
#endif
Axx_rhs = fxx - gxx * f
Ayy_rhs = fyy - gyy * f
Azz_rhs = fzz - gzz * f
Axy_rhs = fxy - gxy * f
Axz_rhs = fxz - gxz * f
Ayz_rhs = fyz - gyz * f
! Now: store A_il A^l_j into fij:
fxx = gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz)
fyy = gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz)
fzz = gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz)
fxy = gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
gupxy *(Axx * Ayy + Axy * Axy) + &
gupxz *(Axx * Ayz + Axz * Axy) + &
gupyz *(Axy * Ayz + Axz * Ayy)
fxz = gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
gupxy *(Axx * Ayz + Axy * Axz) + &
gupxz *(Axx * Azz + Axz * Axz) + &
gupyz *(Axy * Azz + Axz * Ayz)
fyz = gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
gupxy *(Axy * Ayz + Ayy * Axz) + &
gupxz *(Axy * Azz + Ayz * Axz) + &
gupyz *(Ayy * Azz + Ayz * Ayz)
f = chin1
! store D^i D_i Lap in trK_rhs
trK_rhs = f*trK_rhs
Axx_rhs = f * Axx_rhs+ alpn1 * (trK * Axx - TWO * fxx) + &
TWO * ( Axx * betaxx + Axy * betayx + Axz * betazx )- &
F2o3 * Axx * div_beta
Ayy_rhs = f * Ayy_rhs+ alpn1 * (trK * Ayy - TWO * fyy) + &
TWO * ( Axy * betaxy + Ayy * betayy + Ayz * betazy )- &
F2o3 * Ayy * div_beta
Azz_rhs = f * Azz_rhs+ alpn1 * (trK * Azz - TWO * fzz) + &
TWO * ( Axz * betaxz + Ayz * betayz + Azz * betazz )- &
F2o3 * Azz * div_beta
Axy_rhs = f * Axy_rhs+ alpn1 *( trK * Axy - TWO * fxy )+ &
Axx * betaxy + Axz * betazy + &
Ayy * betayx + Ayz * betazx + &
F1o3 * Axy * div_beta - Axy * betazz
Ayz_rhs = f * Ayz_rhs+ alpn1 *( trK * Ayz - TWO * fyz )+ &
Axy * betaxz + Ayy * betayz + &
Axz * betaxy + Azz * betazy + &
F1o3 * Ayz * div_beta - Ayz * betaxx
Axz_rhs = f * Axz_rhs+ alpn1 *( trK * Axz - TWO * fxz )+ &
Axx * betaxz + Axy * betayz + &
Ayz * betayx + Azz * betazx + &
F1o3 * Axz * div_beta - Axz * betayy !rhs for Aij
! Compute trace of S_ij
S = f * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
trK_rhs = - trK_rhs + alpn1 *( F1o3 * trK * trK + &
gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO * ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + &
FOUR * PI * ( rho + S )) !rhs for trK
!covariant second derivative of chi respect to tilted metric
call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k) * chix(i,j,k) - Gamyxx(i,j,k) * chiy(i,j,k) - Gamzxx(i,j,k) * chiz(i,j,k)
fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k) * chix(i,j,k) - Gamyxy(i,j,k) * chiy(i,j,k) - Gamzxy(i,j,k) * chiz(i,j,k)
fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k) * chix(i,j,k) - Gamyxz(i,j,k) * chiy(i,j,k) - Gamzxz(i,j,k) * chiz(i,j,k)
fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k) * chix(i,j,k) - Gamyyy(i,j,k) * chiy(i,j,k) - Gamzyy(i,j,k) * chiz(i,j,k)
fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k) * chix(i,j,k) - Gamyyz(i,j,k) * chiy(i,j,k) - Gamzyz(i,j,k) * chiz(i,j,k)
fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k) * chix(i,j,k) - Gamyzz(i,j,k) * chiy(i,j,k) - Gamzzz(i,j,k) * chiz(i,j,k)
chin_loc = chin1(i,j,k)
f_loc = gupxx(i,j,k) * (fxx(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chix(i,j,k)) + &
gupyy(i,j,k) * (fyy(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiy(i,j,k)) + &
gupzz(i,j,k) * (fzz(i,j,k) - F3o2/chin_loc * chiz(i,j,k) * chiz(i,j,k)) + &
TWO * gupxy(i,j,k) * (fxy(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiy(i,j,k)) + &
TWO * gupxz(i,j,k) * (fxz(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiz(i,j,k)) + &
TWO * gupyz(i,j,k) * (fyz(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiz(i,j,k))
f(i,j,k) = f_loc
Rxx(i,j,k) = Rxx(i,j,k) + (fxx(i,j,k) - chix(i,j,k)*chix(i,j,k)/chin_loc/TWO + gxx(i,j,k) * f_loc)/chin_loc/TWO
Ryy(i,j,k) = Ryy(i,j,k) + (fyy(i,j,k) - chiy(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gyy(i,j,k) * f_loc)/chin_loc/TWO
Rzz(i,j,k) = Rzz(i,j,k) + (fzz(i,j,k) - chiz(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gzz(i,j,k) * f_loc)/chin_loc/TWO
Rxy(i,j,k) = Rxy(i,j,k) + (fxy(i,j,k) - chix(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gxy(i,j,k) * f_loc)/chin_loc/TWO
Rxz(i,j,k) = Rxz(i,j,k) + (fxz(i,j,k) - chix(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gxz(i,j,k) * f_loc)/chin_loc/TWO
Ryz(i,j,k) = Ryz(i,j,k) + (fyz(i,j,k) - chiy(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gyz(i,j,k) * f_loc)/chin_loc/TWO
enddo
enddo
enddo
! covariant second derivatives of the lapse respect to physical metric
call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
SYM,SYM,SYM,symmetry,Lev)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
chin_loc = chin1(i,j,k)
gxxx(i,j,k) = (gupxx(i,j,k) * chix(i,j,k) + gupxy(i,j,k) * chiy(i,j,k) + gupxz(i,j,k) * chiz(i,j,k)) / chin_loc
gxxy(i,j,k) = (gupxy(i,j,k) * chix(i,j,k) + gupyy(i,j,k) * chiy(i,j,k) + gupyz(i,j,k) * chiz(i,j,k)) / chin_loc
gxxz(i,j,k) = (gupxz(i,j,k) * chix(i,j,k) + gupyz(i,j,k) * chiy(i,j,k) + gupzz(i,j,k) * chiz(i,j,k)) / chin_loc
Gamxxx(i,j,k) = Gamxxx(i,j,k) - ( (chix(i,j,k) + chix(i,j,k))/chin_loc - gxx(i,j,k) * gxxx(i,j,k) )*HALF
Gamyxx(i,j,k) = Gamyxx(i,j,k) - ( - gxx(i,j,k) * gxxy(i,j,k) )*HALF
Gamzxx(i,j,k) = Gamzxx(i,j,k) - ( - gxx(i,j,k) * gxxz(i,j,k) )*HALF
Gamxyy(i,j,k) = Gamxyy(i,j,k) - ( - gyy(i,j,k) * gxxx(i,j,k) )*HALF
Gamyyy(i,j,k) = Gamyyy(i,j,k) - ( (chiy(i,j,k) + chiy(i,j,k))/chin_loc - gyy(i,j,k) * gxxy(i,j,k) )*HALF
Gamzyy(i,j,k) = Gamzyy(i,j,k) - ( - gyy(i,j,k) * gxxz(i,j,k) )*HALF
Gamxzz(i,j,k) = Gamxzz(i,j,k) - ( - gzz(i,j,k) * gxxx(i,j,k) )*HALF
Gamyzz(i,j,k) = Gamyzz(i,j,k) - ( - gzz(i,j,k) * gxxy(i,j,k) )*HALF
Gamzzz(i,j,k) = Gamzzz(i,j,k) - ( (chiz(i,j,k) + chiz(i,j,k))/chin_loc - gzz(i,j,k) * gxxz(i,j,k) )*HALF
Gamxxy(i,j,k) = Gamxxy(i,j,k) - ( chiy(i,j,k) /chin_loc - gxy(i,j,k) * gxxx(i,j,k) )*HALF
Gamyxy(i,j,k) = Gamyxy(i,j,k) - ( chix(i,j,k) /chin_loc - gxy(i,j,k) * gxxy(i,j,k) )*HALF
Gamzxy(i,j,k) = Gamzxy(i,j,k) - ( - gxy(i,j,k) * gxxz(i,j,k) )*HALF
Gamxxz(i,j,k) = Gamxxz(i,j,k) - ( chiz(i,j,k) /chin_loc - gxz(i,j,k) * gxxx(i,j,k) )*HALF
Gamyxz(i,j,k) = Gamyxz(i,j,k) - ( - gxz(i,j,k) * gxxy(i,j,k) )*HALF
Gamzxz(i,j,k) = Gamzxz(i,j,k) - ( chix(i,j,k) /chin_loc - gxz(i,j,k) * gxxz(i,j,k) )*HALF
Gamxyz(i,j,k) = Gamxyz(i,j,k) - ( - gyz(i,j,k) * gxxx(i,j,k) )*HALF
Gamyyz(i,j,k) = Gamyyz(i,j,k) - ( chiz(i,j,k) /chin_loc - gyz(i,j,k) * gxxy(i,j,k) )*HALF
Gamzyz(i,j,k) = Gamzyz(i,j,k) - ( chiy(i,j,k) /chin_loc - gyz(i,j,k) * gxxz(i,j,k) )*HALF
fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k)*Lapx(i,j,k) - Gamyxx(i,j,k)*Lapy(i,j,k) - Gamzxx(i,j,k)*Lapz(i,j,k)
fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k)*Lapx(i,j,k) - Gamyyy(i,j,k)*Lapy(i,j,k) - Gamzyy(i,j,k)*Lapz(i,j,k)
fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k)*Lapx(i,j,k) - Gamyzz(i,j,k)*Lapy(i,j,k) - Gamzzz(i,j,k)*Lapz(i,j,k)
fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k)*Lapx(i,j,k) - Gamyxy(i,j,k)*Lapy(i,j,k) - Gamzxy(i,j,k)*Lapz(i,j,k)
fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k)*Lapx(i,j,k) - Gamyxz(i,j,k)*Lapy(i,j,k) - Gamzxz(i,j,k)*Lapz(i,j,k)
fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k)*Lapx(i,j,k) - Gamyyz(i,j,k)*Lapy(i,j,k) - Gamzyz(i,j,k)*Lapz(i,j,k)
trK_rhs(i,j,k) = gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k))
enddo
enddo
enddo
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
divb_loc = div_beta(i,j,k)
chin_loc = chin1(i,j,k)
S_loc = chin_loc * ( gupxx(i,j,k) * Sxx(i,j,k) + gupyy(i,j,k) * Syy(i,j,k) + gupzz(i,j,k) * Szz(i,j,k) + &
TWO * (gupxy(i,j,k) * Sxy(i,j,k) + gupxz(i,j,k) * Sxz(i,j,k) + gupyz(i,j,k) * Syz(i,j,k)) )
S(i,j,k) = S_loc
f_loc = F2o3 * trK(i,j,k) * trK(i,j,k) - ( &
gupxx(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + &
TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + &
gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k)) ) + &
gupyy(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k)) ) + &
gupzz(i,j,k) * ( gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + &
TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k)) ) + &
TWO * ( gupxy(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k)) ) + &
gupxz(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k)) ) + &
gupyz(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + &
gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k)) ) ) ) - &
F16 * PI * rho(i,j,k) + EIGHT * PI * S_loc
f_loc = -F1o3 * ( gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + &
alpn1(i,j,k)/chin_loc * f_loc )
f(i,j,k) = f_loc
l_fxx = alpn1(i,j,k) * (Rxx(i,j,k) - EIGHT * PI * Sxx(i,j,k)) - fxx(i,j,k)
l_fxy = alpn1(i,j,k) * (Rxy(i,j,k) - EIGHT * PI * Sxy(i,j,k)) - fxy(i,j,k)
l_fxz = alpn1(i,j,k) * (Rxz(i,j,k) - EIGHT * PI * Sxz(i,j,k)) - fxz(i,j,k)
l_fyy = alpn1(i,j,k) * (Ryy(i,j,k) - EIGHT * PI * Syy(i,j,k)) - fyy(i,j,k)
l_fyz = alpn1(i,j,k) * (Ryz(i,j,k) - EIGHT * PI * Syz(i,j,k)) - fyz(i,j,k)
l_fzz = alpn1(i,j,k) * (Rzz(i,j,k) - EIGHT * PI * Szz(i,j,k)) - fzz(i,j,k)
Axx_rhs(i,j,k) = l_fxx - gxx(i,j,k) * f_loc
Ayy_rhs(i,j,k) = l_fyy - gyy(i,j,k) * f_loc
Azz_rhs(i,j,k) = l_fzz - gzz(i,j,k) * f_loc
Axy_rhs(i,j,k) = l_fxy - gxy(i,j,k) * f_loc
Axz_rhs(i,j,k) = l_fxz - gxz(i,j,k) * f_loc
Ayz_rhs(i,j,k) = l_fyz - gyz(i,j,k) * f_loc
fxx(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + &
gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k))
fyy(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k))
fzz(i,j,k) = gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k))
fxy(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k))
fxz(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k))
fyz(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k))
trK_rhs(i,j,k) = chin_loc * trK_rhs(i,j,k)
Axx_rhs(i,j,k) = chin_loc * Axx_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axx(i,j,k) - TWO * fxx(i,j,k)) + &
TWO * (Axx(i,j,k) * betaxx(i,j,k) + Axy(i,j,k) * betayx(i,j,k) + Axz(i,j,k) * betazx(i,j,k)) - &
F2o3 * Axx(i,j,k) * divb_loc
Ayy_rhs(i,j,k) = chin_loc * Ayy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayy(i,j,k) - TWO * fyy(i,j,k)) + &
TWO * (Axy(i,j,k) * betaxy(i,j,k) + Ayy(i,j,k) * betayy(i,j,k) + Ayz(i,j,k) * betazy(i,j,k)) - &
F2o3 * Ayy(i,j,k) * divb_loc
Azz_rhs(i,j,k) = chin_loc * Azz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Azz(i,j,k) - TWO * fzz(i,j,k)) + &
TWO * (Axz(i,j,k) * betaxz(i,j,k) + Ayz(i,j,k) * betayz(i,j,k) + Azz(i,j,k) * betazz(i,j,k)) - &
F2o3 * Azz(i,j,k) * divb_loc
Axy_rhs(i,j,k) = chin_loc * Axy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axy(i,j,k) - TWO * fxy(i,j,k)) + &
Axx(i,j,k) * betaxy(i,j,k) + Axz(i,j,k) * betazy(i,j,k) + Ayy(i,j,k) * betayx(i,j,k) + &
Ayz(i,j,k) * betazx(i,j,k) + F1o3 * Axy(i,j,k) * divb_loc - Axy(i,j,k) * betazz(i,j,k)
Ayz_rhs(i,j,k) = chin_loc * Ayz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayz(i,j,k) - TWO * fyz(i,j,k)) + &
Axy(i,j,k) * betaxz(i,j,k) + Ayy(i,j,k) * betayz(i,j,k) + Axz(i,j,k) * betaxy(i,j,k) + &
Azz(i,j,k) * betazy(i,j,k) + F1o3 * Ayz(i,j,k) * divb_loc - Ayz(i,j,k) * betaxx(i,j,k)
Axz_rhs(i,j,k) = chin_loc * Axz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axz(i,j,k) - TWO * fxz(i,j,k)) + &
Axx(i,j,k) * betaxz(i,j,k) + Axy(i,j,k) * betayz(i,j,k) + Ayz(i,j,k) * betayx(i,j,k) + &
Azz(i,j,k) * betazx(i,j,k) + F1o3 * Axz(i,j,k) * divb_loc - Axz(i,j,k) * betayy(i,j,k)
trK_rhs(i,j,k) = - trK_rhs(i,j,k) + alpn1(i,j,k) * ( F1o3 * trK(i,j,k) * trK(i,j,k) + &
gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + &
FOUR * PI * (rho(i,j,k) + S_loc) )
enddo
enddo
enddo
!!!! gauge variable part
@@ -948,15 +1000,15 @@
!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
! lopsided_kodis shares the symmetry_bd buffer between advection and
! dissipation, eliminating redundant full-grid copies. For metric variables
! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
! so the constant offset has no effect on dissipation.
call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
! gxx/gyy/gzz (=dxx/dyy/dzz+1): stencil coefficients sum to zero,
! so the constant offset has no effect on dissipation.
call lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
call lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
call lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)

View File

@@ -22,19 +22,32 @@
#define f_compute_rhs_Z4c_ss COMPUTE_RHS_Z4C_SS
#define f_compute_constraint_fr COMPUTE_CONSTRAINT_FR
#endif
#ifdef fortran3
#define f_compute_rhs_bssn compute_rhs_bssn_
#ifdef fortran3
#define f_compute_rhs_bssn compute_rhs_bssn_
#define f_compute_rhs_bssn_ss compute_rhs_bssn_ss_
#define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar_
#define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss_
#define f_compute_rhs_Z4c compute_rhs_z4c_
#define f_compute_rhs_Z4cnot compute_rhs_z4cnot_
#define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_
#define f_compute_constraint_fr compute_constraint_fr_
#endif
extern "C"
{
int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z
#define f_compute_constraint_fr compute_constraint_fr_
#endif
#ifdef __cplusplus
extern "C"
{
#endif
void f_bssn_rhs_kernel_timing_reset();
int f_bssn_rhs_kernel_timing_bucket_count();
const double *f_bssn_rhs_kernel_timing_local_seconds();
const char *f_bssn_rhs_kernel_timing_label(int);
#ifdef __cplusplus
}
#endif
extern "C"
{
int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z
double *, double *, // chi, trK
double *, double *, double *, double *, double *, double *, // gij
double *, double *, double *, double *, double *, double *, // Aij

View File

@@ -2,12 +2,88 @@
#include "bssn_rhs.h"
#include "share_func.h"
#include "tool.h"
#include <time.h>
// 0-based i,j,k
// #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny))
// ex(1)=nx, ex(2)=ny, ex(3)=nz
// 用法a[ IDX_F(i,j,k,nx,ny) ]
#ifndef BSSN_KERNEL_FINE_TIMING
#define BSSN_KERNEL_FINE_TIMING 0
#endif
#if BSSN_KERNEL_FINE_TIMING
namespace rhs_kernel_timing
{
enum Bucket
{
KB_SETUP_DERIVS = 0,
KB_GEOM_GAMMA,
KB_RICCI_METRIC,
KB_CHI_LAPSE,
KB_AIJ_TRK_GAUGE,
KB_KO_CONSTRAINT,
KB_COUNT
};
static double local_bucket_seconds[KB_COUNT];
static const char *bucket_labels[KB_COUNT] =
{
"setup_derivs",
"geom_gamma",
"ricci_metric",
"chi_lapse",
"aij_trk_gauge",
"ko_constraint"
};
static inline double now_seconds()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return double(ts.tv_sec) + 1.0e-9 * double(ts.tv_nsec);
}
}
extern "C" void f_bssn_rhs_kernel_timing_reset()
{
for (int i = 0; i < rhs_kernel_timing::KB_COUNT; ++i)
rhs_kernel_timing::local_bucket_seconds[i] = 0.0;
}
extern "C" int f_bssn_rhs_kernel_timing_bucket_count()
{
return rhs_kernel_timing::KB_COUNT;
}
extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds()
{
return rhs_kernel_timing::local_bucket_seconds;
}
extern "C" const char *f_bssn_rhs_kernel_timing_label(int bucket_index)
{
if (bucket_index < 0 || bucket_index >= rhs_kernel_timing::KB_COUNT)
return "unknown";
return rhs_kernel_timing::bucket_labels[bucket_index];
}
#define RHS_KERNEL_TIMER_DECL(var_name) const double var_name = rhs_kernel_timing::now_seconds()
#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name) \
rhs_kernel_timing::local_bucket_seconds[int(rhs_kernel_timing::bucket_name)] += \
rhs_kernel_timing::now_seconds() - (var_name)
#else
extern "C" void f_bssn_rhs_kernel_timing_reset() {}
extern "C" int f_bssn_rhs_kernel_timing_bucket_count() { return 0; }
extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds() { return 0; }
extern "C" const char *f_bssn_rhs_kernel_timing_label(int) { return "disabled"; }
#define RHS_KERNEL_TIMER_DECL(var_name)
#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name)
#endif
// C function that calculates the right-hand side for BSSN equations
int f_compute_rhs_bssn(int *ex, double &T,
double *X, double *Y, double *Z,
@@ -102,6 +178,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
dY = Y[1] - Y[0];
dZ = Z[1] - Z[0];
RHS_KERNEL_TIMER_DECL(timer_setup_derivs);
// 1ms //
for(int i=0;i<all;i+=1){
alpn1[i] = Lap[i] + 1.0;
@@ -141,6 +218,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
(dxx[i] + ONE) * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i]
+ (dzz[i] + ONE) * betazx[i] - gxz[i] * betayy[i];
}
RHS_KERNEL_TIMER_ADD(KB_SETUP_DERIVS, timer_setup_derivs);
RHS_KERNEL_TIMER_DECL(timer_geom_gamma);
// Fused: inverse metric + Gamma constraint + Christoffel (3 loops -> 1)
for(int i=0;i<all;i+=1){
double det = (dxx[i] + ONE) * (dyy[i] + ONE) * (dzz[i] + ONE) + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] -
@@ -283,9 +362,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ ( gupxy[i]*gupyz[i] + gupyy[i]*gupxz[i] ) * Axy[i]
+ ( gupxy[i]*gupzz[i] + gupyz[i]*gupxz[i] ) * Axz[i]
+ ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i];
Rxx[i] = axx; Ryy[i] = ayy; Rzz[i] = azz;
Rxy[i] = axy; Rxz[i] = axz; Ryz[i] = ayz;
Gamx_rhs[i] = - TWO * ( Lapx[i]*axx + Lapy[i]*axy + Lapz[i]*axz ) +
TWO * alpn1[i] * (
-F3o2/chin1[i] * ( chix[i]*axx + chiy[i]*axy + chiz[i]*axz ) -
@@ -315,6 +391,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ TWO * ( Gamzxy[i]*axy + Gamzxz[i]*axz + Gamzyz[i]*ayz )
);
}
RHS_KERNEL_TIMER_ADD(KB_GEOM_GAMMA, timer_geom_gamma);
RHS_KERNEL_TIMER_DECL(timer_ricci_metric);
// 22.3ms //
fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev);
@@ -332,7 +410,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
double lfxx = gxxx[i] + gxyy[i] + gxzz[i];
double lfxy = gxyx[i] + gyyy[i] + gyzz[i];
double lfxz = gxzx[i] + gyzy[i] + gzzz[i];
fxx[i] = lfxx; fxy[i] = lfxy; fxz[i] = lfxz;
double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
+ TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] );
@@ -686,69 +763,74 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ Gamxyz[i] * gzzx[i] + Gamyyz[i] * gzzy[i] + Gamzyz[i] * gzzz[i]
);
}
RHS_KERNEL_TIMER_ADD(KB_RICCI_METRIC, timer_ricci_metric);
RHS_KERNEL_TIMER_DECL(timer_chi_lapse);
// 22.3ms //
fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 7ms //
for (int i=0;i<all;i+=1) {
fxx[i] = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i];
fxy[i] = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i];
fxz[i] = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i];
fyy[i] = fyy[i] - Gamxyy[i] * chix[i] - Gamyyy[i] * chiy[i] - Gamzyy[i] * chiz[i];
fyz[i] = fyz[i] - Gamxyz[i] * chix[i] - Gamyyz[i] * chiy[i] - Gamzyz[i] * chiz[i];
fzz[i] = fzz[i] - Gamxzz[i] * chix[i] - Gamyzz[i] * chiy[i] - Gamzzz[i] * chiz[i];
f[i] =
gupxx[i] * (fxx[i] - (F3o2 / chin1[i]) * chix[i] * chix[i])
+ gupyy[i] * (fyy[i] - (F3o2 / chin1[i]) * chiy[i] * chiy[i])
+ gupzz[i] * (fzz[i] - (F3o2 / chin1[i]) * chiz[i] * chiz[i])
+ TWO * gupxy[i] * (fxy[i] - (F3o2 / chin1[i]) * chix[i] * chiy[i])
+ TWO * gupxz[i] * (fxz[i] - (F3o2 / chin1[i]) * chix[i] * chiz[i])
+ TWO * gupyz[i] * (fyz[i] - (F3o2 / chin1[i]) * chiy[i] * chiz[i]);
Rxx[i] = Rxx[i] + ( fxx[i] - (chix[i] * chix[i]) / (chin1[i] * TWO) + (dxx[i] + ONE) * f[i] ) / (chin1[i] * TWO);
Ryy[i] = Ryy[i] + ( fyy[i] - (chiy[i] * chiy[i]) / (chin1[i] * TWO) + (dyy[i] + ONE) * f[i] ) / (chin1[i] * TWO);
Rzz[i] = Rzz[i] + ( fzz[i] - (chiz[i] * chiz[i]) / (chin1[i] * TWO) + (dzz[i] + ONE) * f[i] ) / (chin1[i] * TWO);
const double inv_chin1 = ONE / chin1[i];
const double half_inv_chin1 = HALF * inv_chin1;
const double scaled_inv = F3o2 * inv_chin1;
const double cxx = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i];
const double cxy = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i];
const double cxz = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i];
const double cyy = fyy[i] - Gamxyy[i] * chix[i] - Gamyyy[i] * chiy[i] - Gamzyy[i] * chiz[i];
const double cyz = fyz[i] - Gamxyz[i] * chix[i] - Gamyyz[i] * chiy[i] - Gamzyz[i] * chiz[i];
const double czz = fzz[i] - Gamxzz[i] * chix[i] - Gamyzz[i] * chiy[i] - Gamzzz[i] * chiz[i];
const double ricci_chi =
gupxx[i] * (cxx - scaled_inv * chix[i] * chix[i])
+ gupyy[i] * (cyy - scaled_inv * chiy[i] * chiy[i])
+ gupzz[i] * (czz - scaled_inv * chiz[i] * chiz[i])
+ TWO * gupxy[i] * (cxy - scaled_inv * chix[i] * chiy[i])
+ TWO * gupxz[i] * (cxz - scaled_inv * chix[i] * chiz[i])
+ TWO * gupyz[i] * (cyz - scaled_inv * chiy[i] * chiz[i]);
f[i] = ricci_chi;
Rxx[i] = Rxx[i] + ( cxx - half_inv_chin1 * chix[i] * chix[i] + (dxx[i] + ONE) * ricci_chi ) * half_inv_chin1;
Ryy[i] = Ryy[i] + ( cyy - half_inv_chin1 * chiy[i] * chiy[i] + (dyy[i] + ONE) * ricci_chi ) * half_inv_chin1;
Rzz[i] = Rzz[i] + ( czz - half_inv_chin1 * chiz[i] * chiz[i] + (dzz[i] + ONE) * ricci_chi ) * half_inv_chin1;
Rxy[i] = Rxy[i] + ( fxy[i] - (chix[i] * chiy[i]) / (chin1[i] * TWO) + gxy[i] * f[i] ) / (chin1[i] * TWO);
Rxz[i] = Rxz[i] + ( fxz[i] - (chix[i] * chiz[i]) / (chin1[i] * TWO) + gxz[i] * f[i] ) / (chin1[i] * TWO);
Ryz[i] = Ryz[i] + ( fyz[i] - (chiy[i] * chiz[i]) / (chin1[i] * TWO) + gyz[i] * f[i] ) / (chin1[i] * TWO);
Rxy[i] = Rxy[i] + ( cxy - half_inv_chin1 * chix[i] * chiy[i] + gxy[i] * ricci_chi ) * half_inv_chin1;
Rxz[i] = Rxz[i] + ( cxz - half_inv_chin1 * chix[i] * chiz[i] + gxz[i] * ricci_chi ) * half_inv_chin1;
Ryz[i] = Ryz[i] + ( cyz - half_inv_chin1 * chiy[i] * chiz[i] + gyz[i] * ricci_chi ) * half_inv_chin1;
}
// 24ms //
fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 6ms //
for (int i=0;i<all;i+=1) {
/* gxxx,gxxy,gxxz (这里是“升指标后的chi导数/chi”那类量你沿用原变量名即可) */
gxxx[i] = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) / chin1[i];
gxxy[i] = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) / chin1[i];
gxxz[i] = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) / chin1[i];
const double inv_chin1 = ONE / chin1[i];
const double gchi_x = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) * inv_chin1;
const double gchi_y = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) * inv_chin1;
const double gchi_z = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) * inv_chin1;
/* Christoffel 修正项 */
Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) / chin1[i]) - (dxx[i] + ONE) * gxxx[i] ) * HALF;
Gamyxx[i] = Gamyxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxy[i] ) * HALF; /* 原式只有 -gxx*gxxy */
Gamzxx[i] = Gamzxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxz[i] ) * HALF;
Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) * inv_chin1) - (dxx[i] + ONE) * gchi_x ) * HALF;
Gamyxx[i] = Gamyxx[i] - ( 0.0 - (dxx[i] + ONE) * gchi_y ) * HALF; /* 原式只有 -gxx*gxxy */
Gamzxx[i] = Gamzxx[i] - ( 0.0 - (dxx[i] + ONE) * gchi_z ) * HALF;
Gamxyy[i] = Gamxyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxx[i] ) * HALF;
Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) / chin1[i]) - (dyy[i] + ONE) * gxxy[i] ) * HALF;
Gamzyy[i] = Gamzyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxz[i] ) * HALF;
Gamxyy[i] = Gamxyy[i] - ( 0.0 - (dyy[i] + ONE) * gchi_x ) * HALF;
Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) * inv_chin1) - (dyy[i] + ONE) * gchi_y ) * HALF;
Gamzyy[i] = Gamzyy[i] - ( 0.0 - (dyy[i] + ONE) * gchi_z ) * HALF;
Gamxzz[i] = Gamxzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxx[i] ) * HALF;
Gamyzz[i] = Gamyzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxy[i] ) * HALF;
Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) / chin1[i]) - (dzz[i] + ONE) * gxxz[i] ) * HALF;
Gamxzz[i] = Gamxzz[i] - ( 0.0 - (dzz[i] + ONE) * gchi_x ) * HALF;
Gamyzz[i] = Gamyzz[i] - ( 0.0 - (dzz[i] + ONE) * gchi_y ) * HALF;
Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) * inv_chin1) - (dzz[i] + ONE) * gchi_z ) * HALF;
Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] / chin1[i]) - gxy[i] * gxxx[i] ) * HALF;
Gamyxy[i] = Gamyxy[i] - ( ( chix[i] / chin1[i]) - gxy[i] * gxxy[i] ) * HALF;
Gamzxy[i] = Gamzxy[i] - ( 0.0 - gxy[i] * gxxz[i] ) * HALF;
Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] * inv_chin1) - gxy[i] * gchi_x ) * HALF;
Gamyxy[i] = Gamyxy[i] - ( ( chix[i] * inv_chin1) - gxy[i] * gchi_y ) * HALF;
Gamzxy[i] = Gamzxy[i] - ( 0.0 - gxy[i] * gchi_z ) * HALF;
Gamxxz[i] = Gamxxz[i] - ( ( chiz[i] / chin1[i]) - gxz[i] * gxxx[i] ) * HALF;
Gamyxz[i] = Gamyxz[i] - ( 0.0 - gxz[i] * gxxy[i] ) * HALF;
Gamzxz[i] = Gamzxz[i] - ( ( chix[i] / chin1[i]) - gxz[i] * gxxz[i] ) * HALF;
Gamxxz[i] = Gamxxz[i] - ( ( chiz[i] * inv_chin1) - gxz[i] * gchi_x ) * HALF;
Gamyxz[i] = Gamyxz[i] - ( 0.0 - gxz[i] * gchi_y ) * HALF;
Gamzxz[i] = Gamzxz[i] - ( ( chix[i] * inv_chin1) - gxz[i] * gchi_z ) * HALF;
Gamxyz[i] = Gamxyz[i] - ( 0.0 - gyz[i] * gxxx[i] ) * HALF;
Gamyyz[i] = Gamyyz[i] - ( ( chiz[i] / chin1[i]) - gyz[i] * gxxy[i] ) * HALF;
Gamzyz[i] = Gamzyz[i] - ( ( chiy[i] / chin1[i]) - gyz[i] * gxxz[i] ) * HALF;
Gamxyz[i] = Gamxyz[i] - ( 0.0 - gyz[i] * gchi_x ) * HALF;
Gamyyz[i] = Gamyyz[i] - ( ( chiz[i] * inv_chin1) - gyz[i] * gchi_y ) * HALF;
Gamzyz[i] = Gamzyz[i] - ( ( chiy[i] * inv_chin1) - gyz[i] * gchi_z ) * HALF;
/* fxx..fyz 修正:减去 Γ * ∂Lap */
fxx[i] = fxx[i] - Gamxxx[i] * Lapx[i] - Gamyxx[i] * Lapy[i] - Gamzxx[i] * Lapz[i];
@@ -762,6 +844,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] );
}
RHS_KERNEL_TIMER_ADD(KB_CHI_LAPSE, timer_chi_lapse);
RHS_KERNEL_TIMER_DECL(timer_aij_trk_gauge);
// 2.5ms //
for (int i=0;i<all;i+=1) {
const double divb = betaxx[i] + betayy[i] + betazz[i];
@@ -1062,6 +1146,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
#endif
}
RHS_KERNEL_TIMER_ADD(KB_AIJ_TRK_GAUGE, timer_aij_trk_gauge);
RHS_KERNEL_TIMER_DECL(timer_ko_constraint);
// advection + KO dissipation with shared symmetry buffer
lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
@@ -1139,60 +1225,61 @@ int f_compute_rhs_bssn(int *ex, double &T,
fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0);
fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
}
// 7ms //
for (int i=0;i<all;i+=1) {
gxxx[i] = gxxx[i] - ( Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]
+ Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i];
gxyx[i] = gxyx[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
+ Gamxxx[i] * Axy[i] + Gamyxx[i] * Ayy[i] + Gamzxx[i] * Ayz[i]) - chix[i]*Axy[i]/chin1[i];
gxzx[i] = gxzx[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
+ Gamxxx[i] * Axz[i] + Gamyxx[i] * Ayz[i] + Gamzxx[i] * Azz[i]) - chix[i]*Axz[i]/chin1[i];
gyyx[i] = gyyx[i] - ( Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]
+ Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chix[i]*Ayy[i]/chin1[i];
gyzx[i] = gyzx[i] - ( Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]
+ Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chix[i]*Ayz[i]/chin1[i];
gzzx[i] = gzzx[i] - ( Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]
+ Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chix[i]*Azz[i]/chin1[i];
gxxy[i] = gxxy[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
+ Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]) - chiy[i]*Axx[i]/chin1[i];
gxyy[i] = gxyy[i] - ( Gamxyy[i] * Axx[i] + Gamyyy[i] * Axy[i] + Gamzyy[i] * Axz[i]
+ Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chiy[i]*Axy[i]/chin1[i];
gxzy[i] = gxzy[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
+ Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chiy[i]*Axz[i]/chin1[i];
gyyy[i] = gyyy[i] - ( Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]
+ Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]) - chiy[i]*Ayy[i]/chin1[i];
gyzy[i] = gyzy[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
+ Gamxyy[i] * Axz[i] + Gamyyy[i] * Ayz[i] + Gamzyy[i] * Azz[i]) - chiy[i]*Ayz[i]/chin1[i];
gzzy[i] = gzzy[i] - ( Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]
+ Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiy[i]*Azz[i]/chin1[i];
gxxz[i] = gxxz[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
+ Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]) - chiz[i]*Axx[i]/chin1[i];
gxyz[i] = gxyz[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
+ Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]) - chiz[i]*Axy[i]/chin1[i];
gxzz[i] = gxzz[i] - ( Gamxzz[i] * Axx[i] + Gamyzz[i] * Axy[i] + Gamzzz[i] * Axz[i]
+ Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chiz[i]*Axz[i]/chin1[i];
gyyz[i] = gyyz[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
+ Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]) - chiz[i]*Ayy[i]/chin1[i];
gyzz[i] = gyzz[i] - ( Gamxzz[i] * Axy[i] + Gamyzz[i] * Ayy[i] + Gamzzz[i] * Ayz[i]
+ Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiz[i]*Ayz[i]/chin1[i];
gzzz[i] = gzzz[i] - ( Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]
+ Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]) - chiz[i]*Azz[i]/chin1[i];
// 7ms //
for (int i=0;i<all;i+=1) {
gxxx[i] = gxxx[i] - ( Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]
+ Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i];
gxyx[i] = gxyx[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
+ Gamxxx[i] * Axy[i] + Gamyxx[i] * Ayy[i] + Gamzxx[i] * Ayz[i]) - chix[i]*Axy[i]/chin1[i];
gxzx[i] = gxzx[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
+ Gamxxx[i] * Axz[i] + Gamyxx[i] * Ayz[i] + Gamzxx[i] * Azz[i]) - chix[i]*Axz[i]/chin1[i];
gyyx[i] = gyyx[i] - ( Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]
+ Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chix[i]*Ayy[i]/chin1[i];
gyzx[i] = gyzx[i] - ( Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]
+ Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chix[i]*Ayz[i]/chin1[i];
gzzx[i] = gzzx[i] - ( Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]
+ Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chix[i]*Azz[i]/chin1[i];
gxxy[i] = gxxy[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
+ Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]) - chiy[i]*Axx[i]/chin1[i];
gxyy[i] = gxyy[i] - ( Gamxyy[i] * Axx[i] + Gamyyy[i] * Axy[i] + Gamzyy[i] * Axz[i]
+ Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chiy[i]*Axy[i]/chin1[i];
gxzy[i] = gxzy[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
+ Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chiy[i]*Axz[i]/chin1[i];
gyyy[i] = gyyy[i] - ( Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]
+ Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]) - chiy[i]*Ayy[i]/chin1[i];
gyzy[i] = gyzy[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
+ Gamxyy[i] * Axz[i] + Gamyyy[i] * Ayz[i] + Gamzyy[i] * Azz[i]) - chiy[i]*Ayz[i]/chin1[i];
gzzy[i] = gzzy[i] - ( Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]
+ Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiy[i]*Azz[i]/chin1[i];
gxxz[i] = gxxz[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
+ Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]) - chiz[i]*Axx[i]/chin1[i];
gxyz[i] = gxyz[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
+ Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]) - chiz[i]*Axy[i]/chin1[i];
gxzz[i] = gxzz[i] - ( Gamxzz[i] * Axx[i] + Gamyzz[i] * Axy[i] + Gamzzz[i] * Axz[i]
+ Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chiz[i]*Axz[i]/chin1[i];
gyyz[i] = gyyz[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
+ Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]) - chiz[i]*Ayy[i]/chin1[i];
gyzz[i] = gyzz[i] - ( Gamxzz[i] * Axy[i] + Gamyzz[i] * Ayy[i] + Gamzzz[i] * Ayz[i]
+ Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiz[i]*Ayz[i]/chin1[i];
gzzz[i] = gzzz[i] - ( Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]
+ Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]) - chiz[i]*Azz[i]/chin1[i];
movx_Res[i] = gupxx[i]*gxxx[i] + gupyy[i]*gxyy[i] + gupzz[i]*gxzz[i]
+ gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i] + gupyz[i]*gxzy[i]
+ gupxy[i]*gxxy[i] + gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i];
movy_Res[i] = gupxx[i]*gxyx[i] + gupyy[i]*gyyy[i] + gupzz[i]*gyzz[i]
+ gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i] + gupyz[i]*gyzy[i]
+ gupxy[i]*gxyy[i] + gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i];
movz_Res[i] = gupxx[i]*gxzx[i] + gupyy[i]*gyzy[i] + gupzz[i]*gzzz[i]
+ gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i] + gupyz[i]*gzzy[i]
+ gupxy[i]*gxzy[i] + gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i];
movx_Res[i] = gupxx[i]*gxxx[i] + gupyy[i]*gxyy[i] + gupzz[i]*gxzz[i]
+ gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i] + gupyz[i]*gxzy[i]
+ gupxy[i]*gxxy[i] + gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i];
movy_Res[i] = gupxx[i]*gxyx[i] + gupyy[i]*gyyy[i] + gupzz[i]*gyzz[i]
+ gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i] + gupyz[i]*gyzy[i]
+ gupxy[i]*gxyy[i] + gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i];
movz_Res[i] = gupxx[i]*gxzx[i] + gupyy[i]*gyzy[i] + gupzz[i]*gzzz[i]
+ gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i] + gupyz[i]*gzzy[i]
+ gupxy[i]*gxzy[i] + gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i];
movx_Res[i] = movx_Res[i] - F2o3*Kx[i] - F8*PI*Sx[i];
movy_Res[i] = movy_Res[i] - F2o3*Ky[i] - F8*PI*Sy[i];
movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
movx_Res[i] = movx_Res[i] - F2o3*Kx[i] - F8*PI*Sx[i];
movy_Res[i] = movy_Res[i] - F2o3*Ky[i] - F8*PI*Sy[i];
movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
}
}
RHS_KERNEL_TIMER_ADD(KB_KO_CONSTRAINT, timer_ko_constraint);

View File

@@ -33,7 +33,7 @@
real*8 :: dX,dY,dZ
real*8,dimension(0:ex(1),0:ex(2),0:ex(3)) :: fh
real*8, dimension(3) :: SoA
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: d2dx,d2dy,d2dz
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
real*8, parameter :: ZEO=0.d0,ONE=1.d0, F60=6.d1
@@ -137,7 +137,7 @@
real*8 :: dX
real*8,dimension(0:ex(1),0:ex(2),0:ex(3)) :: fh
real*8, dimension(3) :: SoA
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: d2dx
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
real*8, parameter :: ZEO=0.d0,ONE=1.d0, F60=6.d1
@@ -1512,8 +1512,9 @@
real*8 :: dX,dY,dZ
real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh
real*8, dimension(3) :: SoA
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
integer :: i_core_min,i_core_max,j_core_min,j_core_max,k_core_min,k_core_max
real*8 :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
real*8 :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
real*8, parameter :: ZEO=0.d0, ONE=1.d0, TWO=2.d0, F1o4=2.5d-1, F9=9.d0, F45=4.5d1
@@ -1560,17 +1561,55 @@
fxx = ZEO
fyy = ZEO
fzz = ZEO
fxy = ZEO
fxz = ZEO
fyz = ZEO
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
!~~~~~~ fxx
if(i+2 <= imax .and. i-2 >= imin)then
!
fzz = ZEO
fxy = ZEO
fxz = ZEO
fyz = ZEO
i_core_min = max(1, imin+2)
i_core_max = min(ex(1), imax-2)
j_core_min = max(1, jmin+2)
j_core_max = min(ex(2), jmax-2)
k_core_min = max(1, kmin+2)
k_core_max = min(ex(3), kmax-2)
if(i_core_min <= i_core_max .and. j_core_min <= j_core_max .and. k_core_min <= k_core_max)then
do k=k_core_min,k_core_max
do j=j_core_min,j_core_max
do i=i_core_min,i_core_max
! interior points always use 4th-order stencils without branch checks
fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
-fh(i+2,j,k)+F16*fh(i+1,j,k) )
fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
-fh(i,j+2,k)+F16*fh(i,j+1,k) )
fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
-fh(i,j,k+2)+F16*fh(i,j,k+1) )
fxy(i,j,k) = Fdxdy*( (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k)) &
-F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k)) &
+F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k)) &
- (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
fxz(i,j,k) = Fdxdz*( (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2)) &
-F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1)) &
+F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1)) &
- (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
fyz(i,j,k) = Fdydz*( (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2)) &
-F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1)) &
+F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1)) &
- (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
enddo
enddo
enddo
endif
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
if(i>=i_core_min .and. i<=i_core_max .and. &
j>=j_core_min .and. j<=j_core_max .and. &
k>=k_core_min .and. k<=k_core_max) cycle
!~~~~~~ fxx
if(i+2 <= imax .and. i-2 >= imin)then
!
! - f(i-2) + 16 f(i-1) - 30 f(i) + 16 f(i+1) - f(i+2)
! fxx(i) = ----------------------------------------------------------
! 12 dx^2

View File

@@ -1511,13 +1511,88 @@ deallocate(f_flat)
f_out = f_out*dX*dY*dZ
return
end subroutine l2normhelper
!--------------------------------------------------------------------------------------
! calculate L2norm especially for shell Blocks
subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
f,f_out,gw,ogw,Symmetry)
return
end subroutine l2normhelper
!--------------------------------------------------------------------------------------
subroutine l2normhelper7(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
f1,f2,f3,f4,f5,f6,f7,f_out,gw)
implicit none
!~~~~~~> Input parameters:
integer,intent(in ):: ex(1:3)
real*8, intent(in ):: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3)),xmin,ymin,zmin,xmax,ymax,zmax
integer,intent(in)::gw
real*8, dimension(ex(1),ex(2),ex(3)),intent(in) :: f1,f2,f3,f4,f5,f6,f7
real*8, intent(out) :: f_out(7)
!~~~~~~> Other variables:
real*8 :: dX, dY, dZ
integer::imin,jmin,kmin
integer::imax,jmax,kmax
integer::i,j,k
real*8 :: s1,s2,s3,s4,s5,s6,s7
dX = X(2) - X(1)
dY = Y(2) - Y(1)
dZ = Z(2) - Z(1)
! for ghost zone
imin = gw+1
jmin = gw+1
kmin = gw+1
imax = ex(1) - gw
jmax = ex(2) - gw
kmax = ex(3) - gw
!for patch boundary (i.e., not ghost boundary)
if(dabs(X(ex(1))-xmax) < dX) imax = ex(1)
if(dabs(Y(ex(2))-ymax) < dY) jmax = ex(2)
if(dabs(Z(ex(3))-zmax) < dZ) kmax = ex(3)
if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1
s1 = 0.d0
s2 = 0.d0
s3 = 0.d0
s4 = 0.d0
s5 = 0.d0
s6 = 0.d0
s7 = 0.d0
do k=kmin,kmax
do j=jmin,jmax
!DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7)
do i=imin,imax
s1 = s1 + f1(i,j,k)*f1(i,j,k)
s2 = s2 + f2(i,j,k)*f2(i,j,k)
s3 = s3 + f3(i,j,k)*f3(i,j,k)
s4 = s4 + f4(i,j,k)*f4(i,j,k)
s5 = s5 + f5(i,j,k)*f5(i,j,k)
s6 = s6 + f6(i,j,k)*f6(i,j,k)
s7 = s7 + f7(i,j,k)*f7(i,j,k)
enddo
enddo
enddo
f_out(1) = s1*dX*dY*dZ
f_out(2) = s2*dX*dY*dZ
f_out(3) = s3*dX*dY*dZ
f_out(4) = s4*dX*dY*dZ
f_out(5) = s5*dX*dY*dZ
f_out(6) = s6*dX*dY*dZ
f_out(7) = s7*dX*dY*dZ
return
end subroutine l2normhelper7
!--------------------------------------------------------------------------------------
! calculate L2norm especially for shell Blocks
subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
f,f_out,gw,ogw,Symmetry)
implicit none
!~~~~~~> Input parameters:

View File

@@ -12,9 +12,10 @@
#define f_global_interpind global_interpind
#define f_global_interpind2d global_interpind2d
#define f_global_interpind1d global_interpind1d
#define f_l2normhelper l2normhelper
#define f_l2normhelper_sh l2normhelper_sh
#define f_l2normhelper_sh_rms l2normhelper_sh_rms
#define f_l2normhelper l2normhelper
#define f_l2normhelper7 l2normhelper7
#define f_l2normhelper_sh l2normhelper_sh
#define f_l2normhelper_sh_rms l2normhelper_sh_rms
#define f_average average
#define f_average3 average3
#define f_average2 average2
@@ -41,9 +42,10 @@
#define f_global_interpind GLOBAL_INTERPIND
#define f_global_interpind2d GLOBAL_INTERPIND2D
#define f_global_interpind1d GLOBAL_INTERPIND1D
#define f_l2normhelper L2NORMHELPER
#define f_l2normhelper_sh L2NORMHELPER_SH
#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
#define f_l2normhelper L2NORMHELPER
#define f_l2normhelper7 L2NORMHELPER7
#define f_l2normhelper_sh L2NORMHELPER_SH
#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
#define f_average AVERAGE
#define f_average3 AVERAGE3
#define f_average2 AVERAGE2
@@ -70,9 +72,10 @@
#define f_global_interpind global_interpind_
#define f_global_interpind2d global_interpind2d_
#define f_global_interpind1d global_interpind1d_
#define f_l2normhelper l2normhelper_
#define f_l2normhelper_sh l2normhelper_sh_
#define f_l2normhelper_sh_rms l2normhelper_sh_rms_
#define f_l2normhelper l2normhelper_
#define f_l2normhelper7 l2normhelper7_
#define f_l2normhelper_sh l2normhelper_sh_
#define f_l2normhelper_sh_rms l2normhelper_sh_rms_
#define f_average average_
#define f_average3 average3_
#define f_average2 average2_
@@ -156,20 +159,29 @@ extern "C"
int *, double *, int &, int &);
}
extern "C"
{
void f_l2normhelper(int *, double *, double *, double *,
double &, double &, double &,
double &, double &, double &,
double *, double &, int &);
}
extern "C"
{
void f_l2normhelper_sh(int *, double *, double *, double *,
double &, double &, double &,
double &, double &, double &,
double *, double &, int &, int &, int &);
extern "C"
{
void f_l2normhelper(int *, double *, double *, double *,
double &, double &, double &,
double &, double &, double &,
double *, double &, int &);
}
extern "C"
{
void f_l2normhelper7(int *, double *, double *, double *,
double &, double &, double &,
double &, double &, double &,
double *, double *, double *, double *,
double *, double *, double *, double *, int &);
}
extern "C"
{
void f_l2normhelper_sh(int *, double *, double *, double *,
double &, double &, double &,
double &, double &, double &,
double *, double &, int &, int &, int &);
}
extern "C"

View File

@@ -29,6 +29,16 @@
#define REGLEV 0
#define BSSN_FINE_TIMING 0
#define BSSN_FINE_TIMING_EVERY 1
#define BSSN_FINE_TIMING_TOPN 8
#define BSSN_KERNEL_FINE_TIMING 0
#define BSSN_ENABLE_STDIN_ABORT_POLL 0
//#define USE_GPU
//#define CHECKDETAIL
@@ -88,6 +98,21 @@
// 0: for every level;
// 1: for all
//
// define BSSN_FINE_TIMING
// enable fine-grained per-timestep timing monitor
//
// define BSSN_FINE_TIMING_EVERY
// report timing every N coarse timesteps
//
// define BSSN_FINE_TIMING_TOPN
// number of hottest timing buckets shown in stdout
//
// define BSSN_KERNEL_FINE_TIMING
// enable split timing inside compute_rhs_bssn
//
// define BSSN_ENABLE_STDIN_ABORT_POLL
// poll stdin and broadcast abort flag every coarse step
//
// define USE_GPU
// use gpu or not
//
@@ -142,4 +167,3 @@
#define TINY 1e-10
#endif /* MICRODEF_H */

View File

@@ -64,8 +64,8 @@ lopsided_c.o: lopsided_c.C
lopsided_kodis_c.o: lopsided_kodis_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata

View File

@@ -1956,10 +1956,13 @@
real*8,dimension(3) :: CD,FD
real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果
real*8 :: tmp_xyz_line(-2:extc(1)) ! 包含 X 向 6 点模板访问所需下界
real*8 :: v1, v2, v3, v4, v5, v6
integer :: ic, jc, kc, ix_offset,ix,iy,iz
integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max,kc_min,kc_max
integer :: i_lo, i_hi, j_lo, j_hi, k_lo, k_hi
logical :: need_full_symmetry
real*8 :: res_line
real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2)) ! 包含 Y/X 向模板访问所需下界
if(wei.ne.3)then
write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
write(*,*)"dim = ",wei
@@ -2062,17 +2065,60 @@
endif
enddo
maxcx = maxval(cix(imino:imaxo))
maxcy = maxval(ciy(jmino:jmaxo))
maxcz = maxval(ciz(kmino:kmaxo))
ic_min = minval(cix(imino:imaxo))
ic_max = maxval(cix(imino:imaxo))
jc_min = minval(ciy(jmino:jmaxo))
jc_max = maxval(ciy(jmino:jmaxo))
kc_min = minval(ciz(kmino:kmaxo))
kc_max = maxval(ciz(kmino:kmaxo))
maxcx = ic_max
maxcy = jc_max
maxcz = kc_max
if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
write(*,*)"error in prolong"
return
endif
call symmetry_bd(3,extc,func,funcc,SoA)
i_lo = ic_min - 2
i_hi = ic_max + 3
j_lo = jc_min - 2
j_hi = jc_max + 3
k_lo = kc_min - 2
k_hi = kc_max + 3
need_full_symmetry = (i_lo < 1) .or. (j_lo < 1) .or. (k_lo < 1)
if(need_full_symmetry)then
call symmetry_bd(3,extc,func,funcc,SoA)
else
funcc(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi) = func(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi)
endif
! 对每个 kpz, kc 固定)预计算 Z 向插值的 2D 切片
do k = kmino, kmaxo
pz = piz(k); kc = ciz(k)
! --- Pass 1: Z 方向,只算一次 ---
do iy = jc_min-2, jc_max+3 ! 仅需的 iy 范围(对应 jc-2:jc+3
do ii = ic_min-2, ic_max+3 ! 仅需的 ii 范围(对应 cix-2:cix+3
tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
end do
end do
do j = jmino, jmaxo
py = piy(j); jc = ciy(j)
! --- Pass 2: Y 方向 ---
do ii = ic_min-2, ic_max+3
tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
end do
! --- Pass 3: X 方向 ---
do i = imino, imaxo
funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3))
end do
end do
end do
!~~~~~~> prolongation start...
#if 0
do k = kmino, kmaxo
pz = piz(k)
kc = ciz(k)
@@ -2106,28 +2152,7 @@
#if 0
! 1. 【降维Z 向】对当前 (j,k) 相关的 6 条 Y 偏置线进行 Z 向插值
! 结果存入 tmp_yz(x_index, y_offset)
do jj = 1, 6
iy = jc - 3 + jj
do ii = 1, extc(1)
tmp_yz(ii, jj) = WC(1,pz)*funcc(ii, iy, kc-2) + &
WC(2,pz)*funcc(ii, iy, kc-1) + &
WC(3,pz)*funcc(ii, iy, kc ) + &
WC(4,pz)*funcc(ii, iy, kc+1) + &
WC(5,pz)*funcc(ii, iy, kc+2) + &
WC(6,pz)*funcc(ii, iy, kc+3)
end do
end do
! 2. 【降维Y 向】将 Z 向结果合并,得到整条 X 轴线上的 Y-Z 融合值
do ii = 1, extc(1)
tmp_xyz_line(ii) = WC(1,py)*tmp_yz(ii, 1) + WC(2,py)*tmp_yz(ii, 2) + &
WC(3,py)*tmp_yz(ii, 3) + WC(4,py)*tmp_yz(ii, 4) + &
WC(5,py)*tmp_yz(ii, 5) + WC(6,py)*tmp_yz(ii, 6)
end do
#endif
! 3. 【降维X 向】最后在最内层只处理 X 方向的 6 点加权
! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次
do i = imino, imaxo
@@ -2145,7 +2170,7 @@
end do
end do
end do
#endif
return
end subroutine prolong3
@@ -2344,7 +2369,14 @@
integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
real*8,dimension(3) :: CD,FD
real*8 :: tmp_xz_plane(-1:extf(1), 6)
real*8 :: tmp_x_line(-1:extf(1))
integer :: fi, fj, fk, ii, jj, kk
integer :: fi_min, fi_max, ii_lo, ii_hi
integer :: fj_min, fj_max, fk_min, fk_max, jj_lo, jj_hi, kk_lo, kk_hi
logical :: need_full_symmetry
if(wei.ne.3)then
write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
write(*,*)"dim = ",wei
@@ -2423,9 +2455,86 @@
stop
endif
call symmetry_bd(2,extf,funf,funff,SoA)
! 仅计算 X 向最终写回所需的窗口:
! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3)
fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1
fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1
fj_min = 2*(jmino + lbc(2) - 1) - 1 - lbf(2) + 1
fj_max = 2*(jmaxo + lbc(2) - 1) - 1 - lbf(2) + 1
fk_min = 2*(kmino + lbc(3) - 1) - 1 - lbf(3) + 1
fk_max = 2*(kmaxo + lbc(3) - 1) - 1 - lbf(3) + 1
ii_lo = fi_min - 2
ii_hi = fi_max + 3
jj_lo = fj_min - 2
jj_hi = fj_max + 3
kk_lo = fk_min - 2
kk_hi = fk_max + 3
if(ii_lo < -1 .or. ii_hi > extf(1) .or. &
jj_lo < -1 .or. jj_hi > extf(2) .or. &
kk_lo < -1 .or. kk_hi > extf(3))then
write(*,*)"restrict3: invalid stencil window"
write(*,*)"ii=",ii_lo,ii_hi," jj=",jj_lo,jj_hi," kk=",kk_lo,kk_hi
write(*,*)"extf=",extf
stop
endif
need_full_symmetry = (ii_lo < 1) .or. (jj_lo < 1) .or. (kk_lo < 1)
if(need_full_symmetry)then
call symmetry_bd(2,extf,funf,funff,SoA)
else
funff(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi) = funf(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi)
endif
!~~~~~~> restriction start...
do k = kmino, kmaxo
fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1
do j = jmino, jmaxo
fj = 2*(j + lbc(2) - 1) - 1 - lbf(2) + 1
! 优化点 1: 显式展开 Z 方向计算,减少循环开销
! 确保 ii 循环是最内层且连续访问
!DIR$ VECTOR ALWAYS
do ii = ii_lo, ii_hi
! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
! 这里直接硬编码 jj 的偏移,彻底消除一层循环
tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
C2*(funff(ii,fj-2,fk-1)+funff(ii,fj-2,fk+2)) + &
C3*(funff(ii,fj-2,fk )+funff(ii,fj-2,fk+1))
tmp_xz_plane(ii, 2) = C1*(funff(ii,fj-1,fk-2)+funff(ii,fj-1,fk+3)) + &
C2*(funff(ii,fj-1,fk-1)+funff(ii,fj-1,fk+2)) + &
C3*(funff(ii,fj-1,fk )+funff(ii,fj-1,fk+1))
tmp_xz_plane(ii, 3) = C1*(funff(ii,fj ,fk-2)+funff(ii,fj ,fk+3)) + &
C2*(funff(ii,fj ,fk-1)+funff(ii,fj ,fk+2)) + &
C3*(funff(ii,fj ,fk )+funff(ii,fj ,fk+1))
tmp_xz_plane(ii, 4) = C1*(funff(ii,fj+1,fk-2)+funff(ii,fj+1,fk+3)) + &
C2*(funff(ii,fj+1,fk-1)+funff(ii,fj+1,fk+2)) + &
C3*(funff(ii,fj+1,fk )+funff(ii,fj+1,fk+1))
tmp_xz_plane(ii, 5) = C1*(funff(ii,fj+2,fk-2)+funff(ii,fj+2,fk+3)) + &
C2*(funff(ii,fj+2,fk-1)+funff(ii,fj+2,fk+2)) + &
C3*(funff(ii,fj+2,fk )+funff(ii,fj+2,fk+1))
tmp_xz_plane(ii, 6) = C1*(funff(ii,fj+3,fk-2)+funff(ii,fj+3,fk+3)) + &
C2*(funff(ii,fj+3,fk-1)+funff(ii,fj+3,fk+2)) + &
C3*(funff(ii,fj+3,fk )+funff(ii,fj+3,fk+1))
end do
! 优化点 2: 同样向量化 Y 方向压缩
!DIR$ VECTOR ALWAYS
do ii = ii_lo, ii_hi
tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
end do
! 优化点 3: 最终写入,利用已经缓存在 tmp_x_line 的数据
do i = imino, imaxo
fi = 2*(i + lbc(1) - 1) - 1 - lbf(1) + 1
func(i, j, k) = C1*(tmp_x_line(fi-2) + tmp_x_line(fi+3)) + &
C2*(tmp_x_line(fi-1) + tmp_x_line(fi+2)) + &
C3*(tmp_x_line(fi ) + tmp_x_line(fi+1))
end do
end do
end do
#if 0
do k = kmino,kmaxo
do j = jmino,jmaxo
do i = imino,imaxo
@@ -2449,7 +2558,7 @@
enddo
enddo
enddo
#endif
return
end subroutine restrict3

File diff suppressed because it is too large Load Diff

View File

@@ -27,19 +27,24 @@ using namespace std;
class surface_integral
{
private:
int Symmetry, factor;
int N_theta, N_phi; // Number of points in Theta & Phi directions
double dphi, dcostheta;
double *arcostheta, *wtcostheta;
int n_tot; // size of arrays
double *nx_g, *ny_g, *nz_g; // global list of unit normals
int myrank, cpusize;
public:
surface_integral(int iSymmetry);
~surface_integral();
private:
int Symmetry, factor;
int N_theta, N_phi; // Number of points in Theta & Phi directions
double dphi, dcostheta;
double *arcostheta, *wtcostheta;
int n_tot; // size of arrays
double *nx_g, *ny_g, *nz_g; // global list of unit normals
int myrank, cpusize;
int wave_cache_spinw, wave_cache_maxl, wave_cache_modes;
double *wave_theta_pos, *wave_theta_neg;
double *wave_phi_cos, *wave_phi_sin;
void clear_wave_cache();
void build_wave_cache(int spinw, int maxl);
public:
surface_integral(int iSymmetry);
~surface_integral();
void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP,
@@ -77,21 +82,37 @@ public:
double &, double &, double &, double &, double &, double &, double &,
double &, double &, double &, double &, double &, double &,
double &, double &)); // NN is the length of RP and IP
void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor);
void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor);
void surf_Wave(double rex, cgh *GH, ShellPatch *SH,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_WaveMassPAng(double rex, int lev, cgh *GH,
var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_WaveMassPAng(double rex, int lev, ShellPatch *GH,
var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_Wave(double rex, cgh *GH, ShellPatch *SH,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *chix, var *chiy, var *chiz,
var *trKx, var *trKy, var *trKz,
@@ -110,12 +131,12 @@ public:
bool SR_Interp_Points(MyList<var> *VarList, cgh *GH, ShellPatch *SH,
int NN, double **XX, double *Shellf);
void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
double *Rout, monitor *Monitor, MPI_Comm Comm_here);
void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
double *Rout, monitor *Monitor, MPI_Comm Comm_here, bool refresh_mass_fields = true);
void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP,
monitor *Monitor, MPI_Comm Comm_here);

View File

@@ -144,6 +144,62 @@ def generate_macrodef_h():
print( "#define REGLEV 0", file=file1 )
print( file=file1 )
# Define fine-grained timing/debug macros.
# All of them default to OFF so production builds do not pay profiling overhead.
fine_timing = getattr(input_data, "Fine_Timing",
getattr(input_data, "Finegrained_Timing", "no"))
kernel_fine_timing = getattr(input_data, "Kernel_Fine_Timing",
getattr(input_data, "BSSN_Kernel_Fine_Timing", "no"))
stdin_abort_poll = getattr(input_data, "Enable_Stdin_Abort_Poll",
getattr(input_data, "Stdin_Abort_Poll", "no"))
timing_report_every = max(1, int(getattr(
input_data, "Timing_Every_Steps",
getattr(input_data, "Timing_Report_Every", 1))))
timing_top_hotspots = max(1, int(getattr(
input_data, "Timing_Top_Hotspots", 8)))
if ( fine_timing == "yes" ):
print( "#define BSSN_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( fine_timing == "no" ):
print( "#define BSSN_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Fine_Timing setting error!!!" )
print()
print( "# Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_EVERY {timing_report_every}", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_TOPN {timing_top_hotspots}", file=file1 )
print( file=file1 )
if ( kernel_fine_timing == "yes" ):
print( "#define BSSN_KERNEL_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( kernel_fine_timing == "no" ):
print( "#define BSSN_KERNEL_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Kernel_Fine_Timing setting error!!!" )
print()
print( "# Kernel_Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
if ( stdin_abort_poll == "yes" ):
print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 1", file=file1 )
print( file=file1 )
elif ( stdin_abort_poll == "no" ):
print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 0", file=file1 )
print( file=file1 )
else:
print( "Enable_Stdin_Abort_Poll setting error!!!" )
print()
print( "# Enable_Stdin_Abort_Poll setting error!!!", file=file1 )
print( file=file1 )
# Define macro USE_GPU
# use GPU or not
@@ -224,6 +280,21 @@ def generate_macrodef_h():
print( "// 0: for every level;", file=file1 )
print( "// 1: for all", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING", file=file1 )
print( "// enable fine-grained per-timestep timing monitor", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_EVERY", file=file1 )
print( "// report timing every N coarse timesteps", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_TOPN", file=file1 )
print( "// number of hottest timing buckets shown in stdout", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_KERNEL_FINE_TIMING", file=file1 )
print( "// enable split timing inside compute_rhs_bssn", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_ENABLE_STDIN_ABORT_POLL", file=file1 )
print( "// poll stdin and broadcast abort flag every coarse step", file=file1 )
print( "//", file=file1 )
print( "// define USE_GPU", file=file1 )
print( "// use gpu or not", file=file1 )
print( "//", file=file1 )