Compare commits

...

27 Commits

Author SHA1 Message Date
3f3f16e881 Switch legacy build to GCC and OpenMPI 2026-04-13 19:39:30 +08:00
9c31384b2f Add optional BSSN kernel profiling switches 2026-04-13 16:51:06 +08:00
e4e741caa1 Remove dead chi derivative setup in BSSN RHS 2026-04-13 15:55:43 +08:00
65e0f95f40 Localize chi Ricci intermediates in RHS 2026-04-13 15:14:31 +08:00
f9fbf97e64 Elide dead stores in BSSN RHS hot path 2026-04-13 15:10:22 +08:00
968522995b Add fine-grained step timing and trim BH RHS overhead 2026-04-13 14:50:55 +08:00
f3988ac8ca Merge wave and mass extraction interpolation 2026-04-13 13:17:36 +08:00
e4c25eb21f Cache wave extraction angular kernels 2026-04-13 12:40:20 +08:00
4b10519876 Reuse mass integrand across detector radii 2026-04-13 11:55:41 +08:00
3a58273501 Batch constraint norm reductions 2026-04-13 11:48:02 +08:00
5c65cea2f0 Optimize constraint refresh after regrid 2026-04-13 11:39:50 +08:00
8c1f4d8108 迁移C算子的循环融合和临时量消除 2026-03-03 16:20:15 +08:00
d310ef918b bssn_rhs(fortran): migrate C kernel loop-fusion optimizations 2026-03-03 16:20:15 +08:00
b35e1b289f 设置开关关闭内存打印统计 2026-03-03 16:17:47 +08:00
05851b2c59 关闭静态负载 2026-03-03 16:17:47 +08:00
3b39583d67 fix(bssn_rhs) 2026-03-03 16:06:33 +08:00
688bdb6708 Merge pull request 'cjy-dystopia' (#3) from cjy-dystopia into main
Reviewed-on: #3
2026-03-02 21:36:26 +08:00
5070134857 perf(transfer_cached): 将 per-call new/delete 的 req_node/req_is_recv/completed 数组移入 SyncCache 复用
避免 transfer_cached 每次调用分配释放 3 个临时数组,减少堆操作开销。

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-02 21:14:35 +08:00
4012e9d068 perf(RestrictProlong): 用 Restrict_cached/OutBdLow2Hi_cached 替换非缓存版本,Sync_finish 改为渐进式解包
- RestrictProlong/RestrictProlong_aux 中的 Restrict() 和 OutBdLow2Hi() 替换为 _cached 版本,
  复用 gridseg 列表和 MPI 缓冲区,避免每次调用重新分配
- 新增 sync_cache_restrict/sync_cache_outbd 两组 per-level 缓存
- Sync_finish 从 MPI_Waitall 改为 MPI_Waitsome 渐进式解包,降低尾延迟
- AsyncSyncState 扩展 req_node/req_is_recv/pending_recv 字段支持渐进解包

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-02 20:48:38 +08:00
b3c367f15b prolong3 改为先算实际 stencil 窗口;只有窗口触及对称边界时才走全域 symmetry_bd,否则只复制必需窗口。restrict3 同样改成窗口判定,无触边时仅填 ii/jj/kk 必需窗口。 2026-03-02 17:38:56 +08:00
e73911f292 perf(restrict3): shrink X-pass ii sweep to required overlap window
- compute fi_min/fi_max from output i-range and derive ii_lo/ii_hi
 - replace full ii sweep (-1:extf(1)) with windowed sweep in Z/Y precompute passes
 - keep stencil math unchanged; add bounds sanity check for ii window
2026-03-02 17:37:13 +08:00
7543d3e8c7 perf(MPatch): 用空间 bin 索引加速 Interp_Points 的 block 归属查找
- 为 Patch::Interp_Points 三个重载引入 BlockBinIndex(候选筛选 + 全扫回退)
  - 保持原 point-in-block 判定与后续插值/通信流程不变
  - 将逐点线性扫块从 O(N_points*N_blocks) 降为近似 O(N_points*k)
  - 测试:bin 上限如果太大,会引入不必要的索引构建开销。将 bins 上限设为 16。

Co-authored-by: gpt-5.3-codex
2026-03-02 17:37:13 +08:00
42c69fab24 refactor(Parallel): streamline MPI communication by consolidating request handling and memory management 2026-03-02 17:37:13 +08:00
95220a05c8 optimize fdderivs core-region branch elimination for ghost_width=3 2026-03-02 17:33:26 +08:00
466b084a58 fix prolong/restrict index bounds after cherry-pick 12e1f63 2026-03-02 13:59:47 +08:00
61ccef9f97 prolong3: 减少Z-pass 冗余计算 2026-03-02 13:58:52 +08:00
524d1d1512 Merge pull request 'cjy-dystopia' (#2) from cjy-dystopia into main
Reviewed-on: #2
2026-03-01 19:22:09 +08:00
25 changed files with 4168 additions and 2133 deletions

View File

@@ -37,51 +37,56 @@ close(77)
end program checkFFT end program checkFFT
#endif #endif
!-------------
! In-place radix-2 complex FFT (bit reversal followed by Danielson-Lanczos
! butterflies).
!
! Arguments:
!   dataa : interleaved complex array [Re(0),Im(0),Re(1),Im(1),...] holding nn
!           complex points; overwritten with the (unnormalised) transform.
!   nn    : number of complex points. Must be a power of 2, otherwise the
!           butterfly stage would index past dataa(2*nn).
!   isign : expected to be +1 or -1; it selects the sign of the exponent:
!             X[k] = sum_{n=0}^{nn-1} x[n] * exp(isign*2*pi*i*k*n/nn)
!
! No 1/nn normalisation is applied in either direction.
!-------------
SUBROUTINE four1(dataa,nn,isign)
  implicit none
  integer, intent(in) :: isign, nn
  double precision, dimension(2*nn), intent(inout) :: dataa

  double precision, parameter :: twopi = 6.283185307179586476925286766559d0
  integer :: i, istep, j, m, mmax, n
  double precision :: tempi, tempr
  double precision :: theta, wi, wpi, wpr, wr, wtemp

  ! Nothing to transform (the loops below would not execute anyway).
  if (nn < 1) return

  ! A non power-of-2 length makes the butterfly indices run out of bounds,
  ! so refuse it instead of corrupting memory.
  if (iand(nn, nn-1) /= 0) then
    write(*,*) 'four1: nn must be a power of 2, got nn = ', nn
    stop 1
  endif

  n = 2*nn

  ! --- bit-reversal permutation of the complex points ---
  j = 1
  do i = 1, n, 2
    if (j > i) then
      tempr      = dataa(j)
      tempi      = dataa(j+1)
      dataa(j)   = dataa(i)
      dataa(j+1) = dataa(i+1)
      dataa(i)   = tempr
      dataa(i+1) = tempi
    endif
    m = nn
    do while ((m >= 2) .and. (j > m))
      j = j - m
      m = m/2
    enddo
    j = j + m
  enddo

  ! --- Danielson-Lanczos stages: sub-transform length doubles each pass ---
  mmax = 2
  do while (n > mmax)
    istep = 2*mmax
    theta = twopi/(isign*mmax)
    ! (wpr,wpi) = exp(i*theta) - 1, used to advance the twiddle factor
    wpr = -2.d0*sin(0.5d0*theta)**2
    wpi = sin(theta)
    wr  = 1.d0
    wi  = 0.d0
    do m = 1, mmax, 2
      do i = m, n, istep
        j = i + mmax
        ! keep the twiddle factor in full double precision
        tempr = wr*dataa(j)   - wi*dataa(j+1)
        tempi = wr*dataa(j+1) + wi*dataa(j)
        dataa(j)   = dataa(i)   - tempr
        dataa(j+1) = dataa(i+1) - tempi
        dataa(i)   = dataa(i)   + tempr
        dataa(i+1) = dataa(i+1) + tempi
      enddo
      wtemp = wr
      wr = wr*wpr - wi*wpi + wr
      wi = wi*wpr + wtemp*wpi + wi
    enddo
    mmax = istep
  enddo

  return
END SUBROUTINE four1

View File

@@ -7,6 +7,7 @@
#include <string> #include <string>
#include <cmath> #include <cmath>
#include <new> #include <new>
#include <vector>
using namespace std; using namespace std;
#include "misc.h" #include "misc.h"
@@ -17,6 +18,168 @@ using namespace std;
#include "interp_lb_profile.h" #include "interp_lb_profile.h"
#endif #endif
namespace
{
// One Block together with the lower/upper corner of the region against which
// interpolation points are tested (see point_in_block_view).  The corners are
// filled in by build_block_bin_index from the block and patch bounding boxes.
struct InterpBlockView
{
// Block this view refers to.
Block *bp;
// Lower corner of the test region, one entry per dimension.
double llb[dim];
// Upper corner of the test region, one entry per dimension.
double uub[dim];
};
// Uniform bin grid over a patch used to shortlist the blocks that may contain
// a point.  bin_to_blocks[bin] lists indices into views; lo/inv map a
// coordinate to a bin number along each axis.  valid stays false until the
// index has been built successfully.
struct BlockBinIndex
{
  int bins[dim];
  double lo[dim];
  double inv[dim];
  vector<InterpBlockView> views;
  vector<vector<int>> bin_to_blocks;
  bool valid;

  // Start as an empty, not-yet-usable index: one bin per axis, zero origin
  // and zero scale.
  BlockBinIndex()
  {
    valid = false;
    int d = dim;
    while (d-- > 0)
    {
      inv[d] = 0.0;
      lo[d] = 0.0;
      bins[d] = 1;
    }
  }
};
// Limit v to the range [lo, hi].  The lower bound is checked first, so when
// lo > hi a value below lo still yields lo.
inline int clamp_int(int v, int lo, int hi)
{
  if (v < lo)
    return lo;
  if (v > hi)
    return hi;
  return v;
}
// Map coordinate x to a bin number in [0, nb - 1].
//   lo  : coordinate of the lower edge of bin 0
//   inv : bins per unit length (<= 0 means "axis not binned")
//   nb  : number of bins along this axis
// Returns 0 when the axis has a single bin or is not binned.  Coordinates
// outside the binned range are clamped to the first/last bin.
//
// The clamp is done on the floating-point value *before* converting to int:
// converting a double that is NaN or outside the range of int is undefined
// behaviour, and far-away interpolation points can produce such values.
inline int coord_to_bin(double x, double lo, double inv, int nb)
{
  if (nb <= 1 || inv <= 0.0)
    return 0;

  const double scaled = std::floor((x - lo) * inv);

  // Written as a negated comparison so that NaN also lands in bin 0.
  if (!(scaled > 0.0))
    return 0;

  const double last = double(nb - 1);
  if (scaled >= last)
    return nb - 1;

  return int(scaled);
}
// Flatten a 3-D bin coordinate (b0, b1, b2) into an offset into
// index.bin_to_blocks; b0 varies fastest.
inline int bin_loc(const BlockBinIndex &index, int b0, int b1, int b2)
{
  const int row = b1 + index.bins[1] * b2;
  return b0 + index.bins[0] * row;
}
// True when point pox lies within [llb - DH/2, uub + DH/2] of the view along
// every dimension, i.e. inside the view's region with half a grid spacing of
// slack on each side.
inline bool point_in_block_view(const InterpBlockView &view, const double *pox, const double *DH)
{
  int d = 0;
  while (d < dim)
  {
    const double from_low = pox[d] - view.llb[d];
    const double from_up = pox[d] - view.uub[d];
    if (from_low < -DH[d] / 2)
      return false;
    if (from_up > DH[d] / 2)
      return false;
    ++d;
  }
  return true;
}
// Build the spatial bin index used to look up which block of `patch` contains
// an interpolation point.
//   patch : patch whose block list (blb .. ble) is indexed
//   DH    : grid spacing per dimension
//   index : output; reset first, and marked valid only if at least one block
//           was found
// Step 1 records, for every block, the region used for the point-in-block
// test.  Step 2 lays a uniform grid of bins over the patch and registers each
// block in every bin its region (widened by DH/2) overlaps.
void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
{
index = BlockBinIndex();
// Step 1: walk the patch's block list from blb up to and including ble.
MyList<Block> *Bp = patch->blb;
while (Bp)
{
Block *BP = Bp->data;
InterpBlockView view;
view.bp = BP;
for (int i = 0; i < dim; i++)
{
// Where a block face coincides with the patch face (feq with tolerance DH/2
// -- presumably an approximate-equality helper, confirm) the bound is moved
// inward by patch->lli / patch->uui cells; otherwise it is moved inward by
// the ghost zone width (minus half a cell in the Vertex case).
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
}
index.views.push_back(view);
// ble is the last block belonging to this patch; stop after processing it.
if (Bp == patch->ble)
break;
Bp = Bp->next;
}
const int nblocks = int(index.views.size());
// No blocks: leave index.valid == false so lookups report "not found".
if (nblocks <= 0)
return;
// Step 2: choose about cbrt(nblocks) bins per axis, so the total number of
// bins is of the order of the number of blocks.
// NOTE(review): the commit description for this index mentions an upper limit
// of 16 bins per axis, while the clamp below uses 32 -- confirm which is intended.
int bins_1d = int(ceil(pow(double(nblocks), 1.0 / 3.0)));
bins_1d = clamp_int(bins_1d, 1, 32);
for (int i = 0; i < dim; i++)
{
index.bins[i] = bins_1d;
index.lo[i] = patch->bbox[i] + patch->lli[i] * DH[i];
const double hi = patch->bbox[dim + i] - patch->uui[i] * DH[i];
// inv converts a coordinate offset into a bin number; 0 disables binning
// along an axis with no extent or when only one bin is used.
if (hi > index.lo[i] && bins_1d > 1)
index.inv[i] = bins_1d / (hi - index.lo[i]);
else
index.inv[i] = 0.0;
}
// The bin table is addressed as a 3-D grid (see bin_loc).
index.bin_to_blocks.resize(index.bins[0] * index.bins[1] * index.bins[2]);
for (int bi = 0; bi < nblocks; bi++)
{
const InterpBlockView &view = index.views[bi];
int bmin[dim], bmax[dim];
for (int d = 0; d < dim; d++)
{
// Widen by DH/2 to match the tolerance used by point_in_block_view.
const double low = view.llb[d] - DH[d] / 2;
const double up = view.uub[d] + DH[d] / 2;
bmin[d] = coord_to_bin(low, index.lo[d], index.inv[d], index.bins[d]);
bmax[d] = coord_to_bin(up, index.lo[d], index.inv[d], index.bins[d]);
// Keep the range ordered even if the bounds came out reversed.
if (bmax[d] < bmin[d])
{
int t = bmin[d];
bmin[d] = bmax[d];
bmax[d] = t;
}
}
// Register the block in every bin of its [bmin, bmax] range; blocks are
// appended in list order, so each bin keeps the original block ordering.
for (int bz = bmin[2]; bz <= bmax[2]; bz++)
for (int by = bmin[1]; by <= bmax[1]; by++)
for (int bx = bmin[0]; bx <= bmax[0]; bx++)
index.bin_to_blocks[bin_loc(index, bx, by, bz)].push_back(bi);
}
index.valid = true;
}
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
{
if (!index.valid)
return -1;
const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
const int bz = coord_to_bin(pox[2], index.lo[2], index.inv[2], index.bins[2]);
const vector<int> &cand = index.bin_to_blocks[bin_loc(index, bx, by, bz)];
for (size_t ci = 0; ci < cand.size(); ci++)
{
const int bi = cand[ci];
if (point_in_block_view(index.views[bi], pox, DH))
return bi;
}
// Fallback to full scan for numerical edge cases around bin boundaries.
for (size_t bi = 0; bi < index.views.size(); bi++)
if (point_in_block_view(index.views[bi], pox, DH))
return int(bi);
return -1;
}
} // namespace
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi) Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
{ {
@@ -367,9 +530,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
for (int j = 0; j < NN; j++) for (int j = 0; j < NN; j++)
owner_rank[j] = -1; owner_rank[j] = -1;
double DH[dim], llb[dim], uub[dim]; double DH[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
DH[i] = getdX(i); DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
for (int j = 0; j < NN; j++) // run along points for (int j = 0; j < NN; j++) // run along points
{ {
@@ -392,57 +557,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
} }
} }
MyList<Block> *Bp = blb; const int block_i = find_block_index_for_point(block_index, pox, DH);
bool notfind = true; if (block_i >= 0)
while (notfind && Bp) // run along Blocks
{ {
Block *BP = Bp->data; Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
bool flag = true; if (myrank == BP->rank)
for (int i = 0; i < dim; i++)
{ {
#ifdef Vertex //---> interpolation
#ifdef Cell varl = VarList;
#error Both Cell and Vertex are defined int k = 0;
#endif while (varl) // run along variables
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
{ {
flag = false; f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
break; pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
} }
} }
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
if (Bp == ble)
break;
Bp = Bp->next;
} }
} }
@@ -535,9 +667,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
for (int j = 0; j < NN; j++) for (int j = 0; j < NN; j++)
owner_rank[j] = -1; owner_rank[j] = -1;
double DH[dim], llb[dim], uub[dim]; double DH[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
DH[i] = getdX(i); DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
// --- Interpolation phase (identical to original) --- // --- Interpolation phase (identical to original) ---
for (int j = 0; j < NN; j++) for (int j = 0; j < NN; j++)
@@ -561,56 +695,23 @@ void Patch::Interp_Points(MyList<var> *VarList,
} }
} }
MyList<Block> *Bp = blb; const int block_i = find_block_index_for_point(block_index, pox, DH);
bool notfind = true; if (block_i >= 0)
while (notfind && Bp)
{ {
Block *BP = Bp->data; Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
bool flag = true; if (myrank == BP->rank)
for (int i = 0; i < dim; i++)
{ {
#ifdef Vertex varl = VarList;
#ifdef Cell int k = 0;
#error Both Cell and Vertex are defined while (varl)
#endif
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
{ {
flag = false; f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
break; pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
} }
} }
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
varl = VarList;
int k = 0;
while (varl)
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
if (Bp == ble)
break;
Bp = Bp->next;
} }
} }
@@ -833,9 +934,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
MPI_Comm_group(MPI_COMM_WORLD, &world_group); MPI_Comm_group(MPI_COMM_WORLD, &world_group);
MPI_Comm_group(Comm_here, &local_group); MPI_Comm_group(Comm_here, &local_group);
double DH[dim], llb[dim], uub[dim]; double DH[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
DH[i] = getdX(i); DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
for (int j = 0; j < NN; j++) // run along points for (int j = 0; j < NN; j++) // run along points
{ {
@@ -858,57 +961,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
} }
} }
MyList<Block> *Bp = blb; const int block_i = find_block_index_for_point(block_index, pox, DH);
bool notfind = true; if (block_i >= 0)
while (notfind && Bp) // run along Blocks
{ {
Block *BP = Bp->data; Block *BP = block_index.views[block_i].bp;
owner_rank[j] = BP->rank;
bool flag = true; if (myrank == BP->rank)
for (int i = 0; i < dim; i++)
{ {
#ifdef Vertex //---> interpolation
#ifdef Cell varl = VarList;
#error Both Cell and Vertex are defined int k = 0;
#endif while (varl) // run along variables
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
{ {
flag = false; f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
break; pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
} }
} }
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
if (Bp == ble)
break;
Bp = Bp->next;
} }
} }

View File

@@ -3893,66 +3893,105 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
int node; int node;
MPI_Request *reqs; MPI_Request *reqs = new MPI_Request[2 * cpusize];
MPI_Status *stats; MPI_Status *stats = new MPI_Status[2 * cpusize];
reqs = new MPI_Request[2 * cpusize]; int *req_node = new int[2 * cpusize];
stats = new MPI_Status[2 * cpusize]; int *req_is_recv = new int[2 * cpusize];
int *completed = new int[2 * cpusize];
int req_no = 0; int req_no = 0;
int pending_recv = 0;
double **send_data, **rec_data; double **send_data = new double *[cpusize];
send_data = new double *[cpusize]; double **rec_data = new double *[cpusize];
rec_data = new double *[cpusize]; int *send_lengths = new int[cpusize];
int length; int *recv_lengths = new int[cpusize];
for (node = 0; node < cpusize; node++) for (node = 0; node < cpusize; node++)
{ {
send_data[node] = rec_data[node] = 0; send_data[node] = rec_data[node] = 0;
if (node == myrank) send_lengths[node] = recv_lengths[node] = 0;
}
// Post receives first so peers can progress rendezvous early.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (recv_lengths[node] > 0)
{ {
if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) rec_data[node] = new double[recv_lengths[node]];
if (!rec_data[node])
{ {
rec_data[node] = new double[length]; cout << "out of memory when new in short transfer, place 1" << endl;
if (!rec_data[node]) MPI_Abort(MPI_COMM_WORLD, 1);
{
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
} }
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
pending_recv++;
} }
else }
// Local transfer on this rank.
recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
if (recv_lengths[myrank] > 0)
{
rec_data[myrank] = new double[recv_lengths[myrank]];
if (!rec_data[myrank])
{ {
// send from this cpu to cpu#node cout << "out of memory when new in short transfer, place 2" << endl;
if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
// Pack and post sends.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (send_lengths[node] > 0)
{
send_data[node] = new double[send_lengths[node]];
if (!send_data[node])
{ {
send_data[node] = new double[length]; cout << "out of memory when new in short transfer, place 3" << endl;
if (!send_data[node]) MPI_Abort(MPI_COMM_WORLD, 1);
{
cout << "out of memory when new in short transfer, place 2" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
} }
// receive from cpu#node to this cpu data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (length = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry)) MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 0;
req_no++;
}
}
// Unpack as soon as receive completes to reduce pure wait time.
while (pending_recv > 0)
{
int outcount = 0;
MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
{
int idx = completed[i];
if (idx >= 0 && req_is_recv[idx])
{ {
rec_data[node] = new double[length]; int recv_node = req_node[idx];
if (!rec_data[node]) data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
{ pending_recv--;
cout << "out of memory when new in short transfer, place 3" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
} }
} }
} }
// wait for all requests to complete
MPI_Waitall(req_no, reqs, stats);
for (node = 0; node < cpusize; node++) if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
if (rec_data[node])
data_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); if (rec_data[myrank])
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
for (node = 0; node < cpusize; node++) for (node = 0; node < cpusize; node++)
{ {
@@ -3964,8 +4003,13 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
delete[] reqs; delete[] reqs;
delete[] stats; delete[] stats;
delete[] req_node;
delete[] req_is_recv;
delete[] completed;
delete[] send_data; delete[] send_data;
delete[] rec_data; delete[] rec_data;
delete[] send_lengths;
delete[] recv_lengths;
} }
// //
void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst, void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
@@ -3978,66 +4022,105 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
int node; int node;
MPI_Request *reqs; MPI_Request *reqs = new MPI_Request[2 * cpusize];
MPI_Status *stats; MPI_Status *stats = new MPI_Status[2 * cpusize];
reqs = new MPI_Request[2 * cpusize]; int *req_node = new int[2 * cpusize];
stats = new MPI_Status[2 * cpusize]; int *req_is_recv = new int[2 * cpusize];
int *completed = new int[2 * cpusize];
int req_no = 0; int req_no = 0;
int pending_recv = 0;
double **send_data, **rec_data; double **send_data = new double *[cpusize];
send_data = new double *[cpusize]; double **rec_data = new double *[cpusize];
rec_data = new double *[cpusize]; int *send_lengths = new int[cpusize];
int length; int *recv_lengths = new int[cpusize];
for (node = 0; node < cpusize; node++) for (node = 0; node < cpusize; node++)
{ {
send_data[node] = rec_data[node] = 0; send_data[node] = rec_data[node] = 0;
if (node == myrank) send_lengths[node] = recv_lengths[node] = 0;
}
// Post receives first so peers can progress rendezvous early.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (recv_lengths[node] > 0)
{ {
if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) rec_data[node] = new double[recv_lengths[node]];
if (!rec_data[node])
{ {
rec_data[node] = new double[length]; cout << "out of memory when new in short transfer, place 1" << endl;
if (!rec_data[node]) MPI_Abort(MPI_COMM_WORLD, 1);
{
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packermix(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
} }
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
pending_recv++;
} }
else }
// Local transfer on this rank.
recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
if (recv_lengths[myrank] > 0)
{
rec_data[myrank] = new double[recv_lengths[myrank]];
if (!rec_data[myrank])
{ {
// send from this cpu to cpu#node cout << "out of memory when new in short transfer, place 2" << endl;
if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
// Pack and post sends.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (send_lengths[node] > 0)
{
send_data[node] = new double[send_lengths[node]];
if (!send_data[node])
{ {
send_data[node] = new double[length]; cout << "out of memory when new in short transfer, place 3" << endl;
if (!send_data[node]) MPI_Abort(MPI_COMM_WORLD, 1);
{
cout << "out of memory when new in short transfer, place 2" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
} }
// receive from cpu#node to this cpu data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (length = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry)) MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 0;
req_no++;
}
}
// Unpack as soon as receive completes to reduce pure wait time.
while (pending_recv > 0)
{
int outcount = 0;
MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
{
int idx = completed[i];
if (idx >= 0 && req_is_recv[idx])
{ {
rec_data[node] = new double[length]; int recv_node = req_node[idx];
if (!rec_data[node]) data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
{ pending_recv--;
cout << "out of memory when new in short transfer, place 3" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
} }
} }
} }
// wait for all requests to complete
MPI_Waitall(req_no, reqs, stats);
for (node = 0; node < cpusize; node++) if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
if (rec_data[node])
data_packermix(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); if (rec_data[myrank])
data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
for (node = 0; node < cpusize; node++) for (node = 0; node < cpusize; node++)
{ {
@@ -4049,8 +4132,13 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
delete[] reqs; delete[] reqs;
delete[] stats; delete[] stats;
delete[] req_node;
delete[] req_is_recv;
delete[] completed;
delete[] send_data; delete[] send_data;
delete[] rec_data; delete[] rec_data;
delete[] send_lengths;
delete[] recv_lengths;
} }
void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry) void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
{ {
@@ -4232,7 +4320,7 @@ Parallel::SyncCache::SyncCache()
: valid(false), cpusize(0), combined_src(0), combined_dst(0), : valid(false), cpusize(0), combined_src(0), combined_dst(0),
send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0), send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0), send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
lengths_valid(false) lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0)
{ {
} }
// SyncCache invalidate: free grid segment lists but keep buffers // SyncCache invalidate: free grid segment lists but keep buffers
@@ -4271,11 +4359,15 @@ void Parallel::SyncCache::destroy()
if (recv_bufs) delete[] recv_bufs; if (recv_bufs) delete[] recv_bufs;
if (reqs) delete[] reqs; if (reqs) delete[] reqs;
if (stats) delete[] stats; if (stats) delete[] stats;
if (tc_req_node) delete[] tc_req_node;
if (tc_req_is_recv) delete[] tc_req_is_recv;
if (tc_completed) delete[] tc_completed;
combined_src = combined_dst = 0; combined_src = combined_dst = 0;
send_lengths = recv_lengths = 0; send_lengths = recv_lengths = 0;
send_buf_caps = recv_buf_caps = 0; send_buf_caps = recv_buf_caps = 0;
send_bufs = recv_bufs = 0; send_bufs = recv_bufs = 0;
reqs = 0; stats = 0; reqs = 0; stats = 0;
tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0;
cpusize = 0; max_reqs = 0; cpusize = 0; max_reqs = 0;
} }
// transfer_cached: reuse pre-allocated buffers from SyncCache // transfer_cached: reuse pre-allocated buffers from SyncCache
@@ -4289,64 +4381,96 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
int cpusize = cache.cpusize; int cpusize = cache.cpusize;
int req_no = 0; int req_no = 0;
int pending_recv = 0;
int node; int node;
int *req_node = cache.tc_req_node;
int *req_is_recv = cache.tc_req_is_recv;
int *completed = cache.tc_completed;
// Post receives first so peers can progress rendezvous early.
for (node = 0; node < cpusize; node++) for (node = 0; node < cpusize; node++)
{ {
if (node == myrank) if (node == myrank) continue;
int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[node] = rlength;
if (rlength > 0)
{ {
int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); if (rlength > cache.recv_buf_caps[node])
cache.recv_lengths[node] = length;
if (length > 0)
{ {
if (length > cache.recv_buf_caps[node]) if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
{ cache.recv_bufs[node] = new double[rlength];
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; cache.recv_buf_caps[node] = rlength;
cache.recv_bufs[node] = new double[length];
cache.recv_buf_caps[node] = length;
}
data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
} }
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
pending_recv++;
} }
else }
// Local transfer on this rank.
int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[myrank] = self_len;
if (self_len > 0)
{
if (self_len > cache.recv_buf_caps[myrank])
{ {
// send if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank];
int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); cache.recv_bufs[myrank] = new double[self_len];
cache.send_lengths[node] = slength; cache.recv_buf_caps[myrank] = self_len;
if (slength > 0) }
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
// Pack and post sends.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cache.send_lengths[node] = slength;
if (slength > 0)
{
if (slength > cache.send_buf_caps[node])
{ {
if (slength > cache.send_buf_caps[node]) if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
{ cache.send_bufs[node] = new double[slength];
if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; cache.send_buf_caps[node] = slength;
cache.send_bufs[node] = new double[slength];
cache.send_buf_caps[node] = slength;
}
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
} }
// recv data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
cache.recv_lengths[node] = rlength; req_node[req_no] = node;
if (rlength > 0) req_is_recv[req_no] = 0;
req_no++;
}
}
// Unpack as soon as receive completes to reduce pure wait time.
while (pending_recv > 0)
{
int outcount = 0;
MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
{
int idx = completed[i];
if (idx >= 0 && req_is_recv[idx])
{ {
if (rlength > cache.recv_buf_caps[node]) int recv_node_i = req_node[idx];
{ data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; pending_recv--;
cache.recv_bufs[node] = new double[rlength];
cache.recv_buf_caps[node] = rlength;
}
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
} }
} }
} }
MPI_Waitall(req_no, cache.reqs, cache.stats); if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
for (node = 0; node < cpusize; node++) if (self_len > 0)
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0) data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
} }
// Sync_cached: build grid segment lists on first call, reuse on subsequent calls
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache) void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
{ {
if (!cache.valid) if (!cache.valid)
@@ -4374,6 +4498,9 @@ void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
cache.max_reqs = 2 * cpusize; cache.max_reqs = 2 * cpusize;
cache.reqs = new MPI_Request[cache.max_reqs]; cache.reqs = new MPI_Request[cache.max_reqs];
cache.stats = new MPI_Status[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs];
cache.tc_req_node = new int[cache.max_reqs];
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
} }
for (int node = 0; node < cpusize; node++) for (int node = 0; node < cpusize; node++)
@@ -4474,6 +4601,9 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
cache.max_reqs = 2 * cpusize; cache.max_reqs = 2 * cpusize;
cache.reqs = new MPI_Request[cache.max_reqs]; cache.reqs = new MPI_Request[cache.max_reqs];
cache.stats = new MPI_Status[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs];
cache.tc_req_node = new int[cache.max_reqs];
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
} }
for (int node = 0; node < cpusize; node++) for (int node = 0; node < cpusize; node++)
@@ -4544,6 +4674,11 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
int cpusize = cache.cpusize; int cpusize = cache.cpusize;
state.req_no = 0; state.req_no = 0;
state.active = true; state.active = true;
state.pending_recv = 0;
// Allocate tracking arrays
delete[] state.req_node; delete[] state.req_is_recv;
state.req_node = new int[cache.max_reqs];
state.req_is_recv = new int[cache.max_reqs];
MyList<Parallel::gridseg> **src = cache.combined_src; MyList<Parallel::gridseg> **src = cache.combined_src;
MyList<Parallel::gridseg> **dst = cache.combined_dst; MyList<Parallel::gridseg> **dst = cache.combined_dst;
@@ -4588,6 +4723,8 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
cache.send_buf_caps[node] = slength; cache.send_buf_caps[node] = slength;
} }
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
state.req_node[state.req_no] = node;
state.req_is_recv[state.req_no] = 0;
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
} }
int rlength; int rlength;
@@ -4605,29 +4742,60 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
cache.recv_bufs[node] = new double[rlength]; cache.recv_bufs[node] = new double[rlength];
cache.recv_buf_caps[node] = rlength; cache.recv_buf_caps[node] = rlength;
} }
state.req_node[state.req_no] = node;
state.req_is_recv[state.req_no] = 1;
state.pending_recv++;
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
} }
} }
} }
cache.lengths_valid = true; cache.lengths_valid = true;
} }
// Sync_finish: wait for async MPI operations and unpack // Sync_finish: progressive unpack as receives complete, then wait for sends
void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
MyList<var> *VarList, int Symmetry) MyList<var> *VarList, int Symmetry)
{ {
if (!state.active) if (!state.active)
return; return;
MPI_Waitall(state.req_no, cache.reqs, cache.stats); int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
int cpusize = cache.cpusize;
MyList<Parallel::gridseg> **src = cache.combined_src; MyList<Parallel::gridseg> **src = cache.combined_src;
MyList<Parallel::gridseg> **dst = cache.combined_dst; MyList<Parallel::gridseg> **dst = cache.combined_dst;
for (int node = 0; node < cpusize; node++) // Unpack local data first (no MPI needed)
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0) if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0)
data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry); data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry);
// Progressive unpack of remote receives
if (state.pending_recv > 0 && state.req_no > 0)
{
int pending = state.pending_recv;
int *completed = new int[cache.max_reqs];
while (pending > 0)
{
int outcount = 0;
MPI_Waitsome(state.req_no, cache.reqs, &outcount, completed, cache.stats);
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
{
int idx = completed[i];
if (idx >= 0 && state.req_is_recv[idx])
{
int recv_node = state.req_node[idx];
data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry);
pending--;
}
}
}
delete[] completed;
}
// Wait for remaining sends
if (state.req_no > 0) MPI_Waitall(state.req_no, cache.reqs, cache.stats);
delete[] state.req_node; state.req_node = 0;
delete[] state.req_is_recv; state.req_is_recv = 0;
state.active = false; state.active = false;
} }
// collect buffer grid segments or blocks for the periodic boundary condition of given patch // collect buffer grid segments or blocks for the periodic boundary condition of given patch
@@ -5116,6 +5284,41 @@ double Parallel::L2Norm(Patch *Pat, var *vf)
return tvf; return tvf;
} }
// Batched L2 norm of seven grid functions on a single patch.
//
// Walks the patch's block list from Pat->blb through Pat->ble. For each block
// owned by this rank (cg->rank equals the MPI_COMM_WORLD rank) the Fortran
// helper f_l2normhelper7 is called once and its seven outputs are added to a
// per-rank accumulator. The seven accumulated values are then combined across
// all ranks with ONE MPI_Allreduce (MPI_SUM, MPI_COMM_WORLD) instead of seven
// separate reductions, and norms[k] receives sqrt() of the k-th reduced value.
//
//   Pat   - patch whose blocks are visited
//   vf    - array of (at least) seven variables; vf[k]->sgfn indexes cg->fgfs
//   norms - output array with room for (at least) seven doubles
//
// NOTE(review): the helper presumably returns volume-weighted sums of squares
// per variable -- confirm against the f_l2normhelper7 implementation.
void Parallel::L2Norm7(Patch *Pat, var **vf, double *norms)
{
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  int BDW = ghost_width;
  double block_part[7];        // output of one helper call
  double rank_sum[7] = {0.0};  // contributions of all blocks owned by this rank
  double global_sum[7];        // reduced over every rank

  for (MyList<Block> *BP = Pat->blb; BP; BP = BP->next)
  {
    Block *cg = BP->data;
    if (cg->rank == myrank)
    {
      f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                      Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
                      Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
                      cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
                      cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
                      cg->fgfs[vf[6]->sgfn], block_part, BDW);
      for (int k = 0; k < 7; k++)
        rank_sum[k] += block_part[k];
    }
    // Pat->ble marks the last block belonging to this patch; stop there.
    if (BP == Pat->ble)
      break;
  }

  MPI_Allreduce(rank_sum, global_sum, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

  for (int k = 0; k < 7; k++)
    norms[k] = sqrt(global_sum[k]);
}
double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here) double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
{ {
int myrank; int myrank;
@@ -5147,6 +5350,41 @@ double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
return tvf; return tvf;
} }
// Batched L2 norm of seven grid functions on a single patch, reduced over a
// caller-supplied communicator.
//
// Walks the patch's block list from Pat->blb through Pat->ble. For each block
// whose cg->rank equals this process's rank, f_l2normhelper7 is called once
// and its seven outputs are added to a per-rank accumulator. One
// MPI_Allreduce (MPI_SUM) over Comm_here combines the seven accumulated
// values, and norms[k] receives sqrt() of the k-th reduced value.
//
//   Pat       - patch whose blocks are visited
//   vf        - array of (at least) seven variables; vf[k]->sgfn indexes cg->fgfs
//   norms     - output array with room for (at least) seven doubles
//   Comm_here - communicator used for the reduction
//
// NOTE(review): block ownership is tested against the MPI_COMM_WORLD rank,
// not the rank within Comm_here -- presumably cg->rank stores world ranks;
// confirm against the block distribution code.
void Parallel::L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here)
{
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  int BDW = ghost_width;
  double block_part[7];        // output of one helper call
  double rank_sum[7] = {0.0};  // contributions of all blocks owned by this rank
  double global_sum[7];        // reduced over Comm_here

  for (MyList<Block> *BP = Pat->blb; BP; BP = BP->next)
  {
    Block *cg = BP->data;
    if (cg->rank == myrank)
    {
      f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                      Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
                      Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
                      cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
                      cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
                      cg->fgfs[vf[6]->sgfn], block_part, BDW);
      for (int k = 0; k < 7; k++)
        rank_sum[k] += block_part[k];
    }
    // Pat->ble marks the last block belonging to this patch; stop there.
    if (BP == Pat->ble)
      break;
  }

  MPI_Allreduce(rank_sum, global_sum, 7, MPI_DOUBLE, MPI_SUM, Comm_here);

  for (int k = 0; k < 7; k++)
    norms[k] = sqrt(global_sum[k]);
}
void Parallel::checkgsl(MyList<Parallel::gridseg> *pp, bool first_only) void Parallel::checkgsl(MyList<Parallel::gridseg> *pp, bool first_only)
{ {
int myrank = 0; int myrank = 0;
@@ -5694,6 +5932,9 @@ void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.max_reqs = 2 * cpusize; cache.max_reqs = 2 * cpusize;
cache.reqs = new MPI_Request[cache.max_reqs]; cache.reqs = new MPI_Request[cache.max_reqs];
cache.stats = new MPI_Status[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs];
cache.tc_req_node = new int[cache.max_reqs];
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
} }
MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL); MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
@@ -5740,6 +5981,9 @@ void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.max_reqs = 2 * cpusize; cache.max_reqs = 2 * cpusize;
cache.reqs = new MPI_Request[cache.max_reqs]; cache.reqs = new MPI_Request[cache.max_reqs];
cache.stats = new MPI_Status[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs];
cache.tc_req_node = new int[cache.max_reqs];
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
} }
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL); MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
@@ -5786,6 +6030,9 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.max_reqs = 2 * cpusize; cache.max_reqs = 2 * cpusize;
cache.reqs = new MPI_Request[cache.max_reqs]; cache.reqs = new MPI_Request[cache.max_reqs];
cache.stats = new MPI_Status[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs];
cache.tc_req_node = new int[cache.max_reqs];
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
} }
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL); MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
@@ -5807,58 +6054,98 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
int cpusize = cache.cpusize; int cpusize = cache.cpusize;
int req_no = 0; int req_no = 0;
int pending_recv = 0;
int *req_node = new int[cache.max_reqs];
int *req_is_recv = new int[cache.max_reqs];
int *completed = new int[cache.max_reqs];
// Post receives first so peers can progress rendezvous early.
for (int node = 0; node < cpusize; node++) for (int node = 0; node < cpusize; node++)
{ {
if (node == myrank) if (node == myrank) continue;
int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[node] = rlength;
if (rlength > 0)
{ {
int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); if (rlength > cache.recv_buf_caps[node])
cache.recv_lengths[node] = length;
if (length > 0)
{ {
if (length > cache.recv_buf_caps[node]) if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
{ cache.recv_bufs[node] = new double[rlength];
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; cache.recv_buf_caps[node] = rlength;
cache.recv_bufs[node] = new double[length];
cache.recv_buf_caps[node] = length;
}
data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
} }
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
pending_recv++;
} }
else }
// Local transfer on this rank.
int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[myrank] = self_len;
if (self_len > 0)
{
if (self_len > cache.recv_buf_caps[myrank])
{ {
int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank];
cache.send_lengths[node] = slength; cache.recv_bufs[myrank] = new double[self_len];
if (slength > 0) cache.recv_buf_caps[myrank] = self_len;
}
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
// Pack and post sends.
for (int node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cache.send_lengths[node] = slength;
if (slength > 0)
{
if (slength > cache.send_buf_caps[node])
{ {
if (slength > cache.send_buf_caps[node]) if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
{ cache.send_bufs[node] = new double[slength];
if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; cache.send_buf_caps[node] = slength;
cache.send_bufs[node] = new double[slength];
cache.send_buf_caps[node] = slength;
}
data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
} }
int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[node] = rlength; MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
if (rlength > 0) req_node[req_no] = node;
req_is_recv[req_no] = 0;
req_no++;
}
}
// Unpack as soon as receive completes to reduce pure wait time.
while (pending_recv > 0)
{
int outcount = 0;
MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
{
int idx = completed[i];
if (idx >= 0 && req_is_recv[idx])
{ {
if (rlength > cache.recv_buf_caps[node]) int recv_node_i = req_node[idx];
{ data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry);
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; pending_recv--;
cache.recv_bufs[node] = new double[rlength];
cache.recv_buf_caps[node] = rlength;
}
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
} }
} }
} }
MPI_Waitall(req_no, cache.reqs, cache.stats); if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
for (int node = 0; node < cpusize; node++) if (self_len > 0)
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0) data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
delete[] req_node;
delete[] req_is_recv;
delete[] completed;
} }
// collect all buffer grid segments or blocks for given patch // collect all buffer grid segments or blocks for given patch

View File

@@ -108,6 +108,9 @@ namespace Parallel
MPI_Status *stats; MPI_Status *stats;
int max_reqs; int max_reqs;
bool lengths_valid; bool lengths_valid;
int *tc_req_node;
int *tc_req_is_recv;
int *tc_completed;
SyncCache(); SyncCache();
void invalidate(); void invalidate();
void destroy(); void destroy();
@@ -121,7 +124,10 @@ namespace Parallel
struct AsyncSyncState { struct AsyncSyncState {
int req_no; int req_no;
bool active; bool active;
AsyncSyncState() : req_no(0), active(false) {} int *req_node;
int *req_is_recv;
int pending_recv;
AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
}; };
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
@@ -177,6 +183,7 @@ namespace Parallel
MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst); MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry); void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
double L2Norm(Patch *Pat, var *vf); double L2Norm(Patch *Pat, var *vf);
void L2Norm7(Patch *Pat, var **vf, double *norms);
void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only); void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
void checkvarl(MyList<var> *pp, bool first_only); void checkvarl(MyList<var> *pp, bool first_only);
MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat); MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
@@ -212,6 +219,7 @@ namespace Parallel
void checkpatchlist(MyList<Patch> *PatL, bool buflog); void checkpatchlist(MyList<Patch> *PatL, bool buflog);
double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here); double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList, bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
int NN, double **XX, int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here); double *Shellf, int Symmetry, MPI_Comm Comm_here);

View File

@@ -3472,6 +3472,43 @@ double ShellPatch::L2Norm(var *vf)
return tvf; return tvf;
} }
// Batched L2 norm of seven grid functions over every shell patch.
//
// Visits each patch in PatL and, within it, each block from blb through ble.
// For blocks owned by this rank (cg->rank == myrank) f_l2normhelper7 is
// called once, using the patch's bbox and `overghost` as the boundary width,
// and its seven outputs are accumulated. A single MPI_Allreduce (MPI_SUM,
// MPI_COMM_WORLD) combines the per-rank accumulators, and norms[k] receives
// sqrt() of the k-th reduced value.
//
//   vf    - array of (at least) seven variables; vf[k]->sgfn indexes cg->fgfs
//   norms - output array with room for (at least) seven doubles
void ShellPatch::L2Norm7(var **vf, double *norms)
{
  int BDW = overghost;
  double block_part[7];        // output of one helper call
  double rank_sum[7] = {0.0};  // contributions of all blocks owned by this rank
  double global_sum[7];        // reduced over every rank

  for (MyList<ss_patch> *sPp = PatL; sPp; sPp = sPp->next)
  {
    for (MyList<Block> *Bp = sPp->data->blb; Bp; Bp = Bp->next)
    {
      Block *cg = Bp->data;
      if (cg->rank == myrank)
      {
        f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                        sPp->data->bbox[0], sPp->data->bbox[1], sPp->data->bbox[2],
                        sPp->data->bbox[3], sPp->data->bbox[4], sPp->data->bbox[5],
                        cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
                        cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
                        cg->fgfs[vf[6]->sgfn], block_part, BDW);
        for (int k = 0; k < 7; k++)
          rank_sum[k] += block_part[k];
      }
      // ble marks the last block of the current shell patch.
      if (Bp == sPp->data->ble)
        break;
    }
  }

  MPI_Allreduce(rank_sum, global_sum, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

  for (int k = 0; k < 7; k++)
    norms[k] = sqrt(global_sum[k]);
}
// find maximum of abstract value; XX stores the position of the maximum, Shellf stores the maxima themselves // find maximum of abstract value; XX stores the position of the maximum, Shellf stores the maxima themselves
void ShellPatch::Find_Maximum(MyList<var> *VarList, double *XX, void ShellPatch::Find_Maximum(MyList<var> *VarList, double *XX,

View File

@@ -198,6 +198,7 @@ public:
void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax, void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
char *filename, int sst); char *filename, int sst);
double L2Norm(var *vf); double L2Norm(var *vf);
void L2Norm7(var **vf, double *norms);
void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf); void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf);
}; };

View File

@@ -27,7 +27,21 @@ using namespace std;
#endif #endif
#include "TwoPunctures.h" #include "TwoPunctures.h"
#include <mkl_cblas.h>
extern "C" {
double cblas_ddot(const int, const double *, const int, const double *, const int);
double cblas_dnrm2(const int, const double *, const int);
void cblas_dgemm(const int, const int, const int,
const int, const int, const int,
const double, const double *, const int,
const double *, const int, const double,
double *, const int);
}
enum {
CblasRowMajor = 101,
CblasNoTrans = 111
};
TwoPunctures::TwoPunctures(double mp, double mm, double b, TwoPunctures::TwoPunctures(double mp, double mm, double b,
double P_plusx, double P_plusy, double P_plusz, double P_plusx, double P_plusy, double P_plusz,

View File

@@ -41,6 +41,239 @@ using namespace std;
#include "derivatives.h" #include "derivatives.h"
#include "ricci_gamma.h" #include "ricci_gamma.h"
// Compile-time switch for per-timestep memory usage collection/printing.
// Default is OFF to reduce overhead in production runs.
#ifndef BSSN_ENABLE_MEM_USAGE_LOG
#define BSSN_ENABLE_MEM_USAGE_LOG 0
#endif
#ifndef BSSN_FINE_TIMING
#define BSSN_FINE_TIMING 0
#endif
#ifndef BSSN_FINE_TIMING_EVERY
#define BSSN_FINE_TIMING_EVERY 1
#endif
#ifndef BSSN_FINE_TIMING_TOPN
#define BSSN_FINE_TIMING_TOPN 8
#endif
#ifndef BSSN_KERNEL_FINE_TIMING
#define BSSN_KERNEL_FINE_TIMING 0
#endif
#ifndef BSSN_ENABLE_STDIN_ABORT_POLL
#define BSSN_ENABLE_STDIN_ABORT_POLL 0
#endif
#if BSSN_FINE_TIMING
// Per-step wall-clock accounting, compiled in only when BSSN_FINE_TIMING is
// non-zero. Each rank accumulates seconds into named buckets via add();
// report() reduces the buckets to rank 0 (max and mean over ranks), appends a
// row to the timing monitor, and prints the most expensive buckets.
namespace step_timing
{
  // One entry per profiled phase of a time step. TB_COUNT is the number of
  // buckets and must stay last; bucket_labels below follows the same order.
  enum Bucket
  {
    TB_ANALYSIS_PSI4 = 0,
    TB_ANALYSIS_SURFACE,
    TB_ANALYSIS_IO,
    TB_BH_PREDICTOR,
    TB_PREDICTOR_RHS,
    TB_PREDICTOR_SYNC,
    TB_BH_CORRECTOR,
    TB_CORRECTOR_RHS,
    TB_CORRECTOR_SYNC,
    TB_STATE_SWAP,
    TB_RESTRICT_PROLONG,
    TB_CONSTRAINT_OUT,
    TB_DUMP_3D,
    TB_DUMP_2D,
    TB_CHECKPOINT,
    TB_REGRID,
    TB_COUNT
  };

  // Seconds accumulated on this rank since the last reset(), per bucket.
  static double local_bucket_seconds[TB_COUNT];

  // Printable name of each bucket, indexed by Bucket value.
  static const char *bucket_labels[TB_COUNT] =
      {
          "analysis_psi4",
          "analysis_surface",
          "analysis_io",
          "bh_predictor",
          "predictor_rhs",
          "predictor_sync",
          "bh_corrector",
          "corrector_rhs",
          "corrector_sync",
          "state_swap",
          "restrict_prolong",
          "constraint_out",
          "dump_3d",
          "dump_2d",
          "checkpoint",
          "regrid"};

  // Zero every bucket on this rank.
  void reset()
  {
    for (int ib = 0; ib != TB_COUNT; ++ib)
      local_bucket_seconds[ib] = 0.0;
  }

  // Add `seconds` to the given bucket on this rank.
  void add(Bucket bucket, double seconds)
  {
    local_bucket_seconds[int(bucket)] += seconds;
  }

  // Print one "label = seconds s (percent%)" row, where percent is
  // `seconds` relative to `wall_seconds` (0 when wall_seconds is not positive).
  static void print_row(const char *label, double seconds, double wall_seconds)
  {
    const double frac = (wall_seconds > 0.0) ? (100.0 * seconds / wall_seconds) : 0.0;
    cout << " "
         << setw(20) << left << label
         << " = " << setw(10) << right << setprecision(6) << seconds
         << " s (" << setw(6) << setprecision(4) << frac << "%)" << endl;
  }

  // Reduce the per-rank buckets to rank 0 and report them.
  //
  // Collective over MPI_COMM_WORLD: every rank takes part in the two
  // MPI_Reduce calls; only rank 0 continues past them. Rank 0 writes one row
  // (step index, step wall time, max per bucket, mean per bucket) through
  // TimingMonitor when it is non-null, then prints the top
  // BSSN_FINE_TIMING_TOPN buckets by max-over-ranks time plus the part of
  // the step not covered by any bucket.
  void report(int myrank, int nprocs, monitor *TimingMonitor,
              int step_index, double phys_time, double step_wall_seconds)
  {
    double worst[TB_COUNT]; // max over ranks
    double mean[TB_COUNT];  // sum over ranks, divided by rank count below
    MPI_Reduce(local_bucket_seconds, worst, TB_COUNT, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(local_bucket_seconds, mean, TB_COUNT, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (myrank != 0)
      return;

    int order[TB_COUNT];
    double residual = step_wall_seconds;
    for (int ib = 0; ib < TB_COUNT; ib++)
    {
      mean[ib] /= Mymax(1, nprocs);
      residual -= worst[ib];
      order[ib] = ib;
    }
    // Bucket maxima can come from different ranks, so their sum may exceed
    // the measured step time; clamp instead of reporting a negative residual.
    if (residual < 0.0)
      residual = 0.0;

    if (TimingMonitor)
    {
      double row[2 + 2 * TB_COUNT];
      row[0] = double(step_index);
      row[1] = step_wall_seconds;
      for (int ib = 0; ib < TB_COUNT; ib++)
      {
        row[2 + ib] = worst[ib];
        row[2 + TB_COUNT + ib] = mean[ib];
      }
      TimingMonitor->writefile(phys_time, 2 + 2 * TB_COUNT, row);
    }

    // Sort bucket indices by descending max time (simple exchange sort;
    // TB_COUNT is small).
    for (int a = 0; a < TB_COUNT - 1; a++)
    {
      for (int b = a + 1; b < TB_COUNT; b++)
      {
        if (worst[order[b]] > worst[order[a]])
        {
          const int held = order[a];
          order[a] = order[b];
          order[b] = held;
        }
      }
    }

    // Save stream formatting so the manipulators used below do not leak.
    ios::fmtflags old_flags = cout.flags();
    streamsize old_precision = cout.precision();

    cout << " Fine timing hot spots (max rank wall estimate):" << endl;
    const int topn = Mymin(BSSN_FINE_TIMING_TOPN, TB_COUNT);
    for (int r = 0; r < topn; r++)
      print_row(bucket_labels[order[r]], worst[order[r]], step_wall_seconds);
    if (residual > 1.0e-6)
      print_row("unprofiled_residual", residual, step_wall_seconds);
    cout << endl;

    cout.flags(old_flags);
    cout.precision(old_precision);
  }
}
// STEP_TIMER_DECL records a start time in a local const; STEP_TIMER_ADD adds
// the time elapsed since that start to the named bucket.
#define STEP_TIMER_DECL(var_name) const double var_name = MPI_Wtime()
#define STEP_TIMER_ADD(bucket_name, var_name) step_timing::add(step_timing::bucket_name, MPI_Wtime() - (var_name))
#else
// Fine timing disabled: both macros expand to nothing.
#define STEP_TIMER_DECL(var_name)
#define STEP_TIMER_ADD(bucket_name, var_name)
#endif
#if BSSN_KERNEL_FINE_TIMING
// Reporting for the timing buckets kept inside the BSSN RHS kernel, compiled
// in only when BSSN_KERNEL_FINE_TIMING is non-zero.
namespace rhs_kernel_timing_report
{
  // Reduce the kernel's per-rank bucket times to rank 0 and print a summary.
  //
  // Bucket count, per-rank seconds and labels come from the
  // f_bssn_rhs_kernel_timing_* accessors. Returns immediately (before any MPI
  // call) when there are no buckets or no data pointer. Otherwise every rank
  // joins two MPI_Reduce calls over MPI_COMM_WORLD (max and sum), and rank 0
  // prints the summed max-over-ranks kernel time relative to
  // step_wall_seconds followed by the top BSSN_FINE_TIMING_TOPN buckets
  // relative to that kernel total.
  void report(int myrank, int nprocs, int step_index, double step_wall_seconds)
  {
    const int bucket_count = f_bssn_rhs_kernel_timing_bucket_count();
    const double *local_bucket_seconds = f_bssn_rhs_kernel_timing_local_seconds();
    if (!local_bucket_seconds || bucket_count <= 0)
      return;

    double *worst = new double[bucket_count]; // max over ranks
    double *mean = new double[bucket_count];  // sum over ranks, averaged on rank 0
    int *rank_order = new int[bucket_count];  // bucket indices, sorted on rank 0

    // The cast drops const for the MPI send-buffer argument.
    MPI_Reduce((void *)local_bucket_seconds, worst, bucket_count, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce((void *)local_bucket_seconds, mean, bucket_count, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (myrank == 0)
    {
      double kernel_total = 0.0;
      for (int ib = 0; ib < bucket_count; ib++)
      {
        mean[ib] /= Mymax(1, nprocs);
        rank_order[ib] = ib;
        kernel_total += worst[ib];
      }

      // Exchange sort of bucket indices by descending max time.
      for (int a = 0; a < bucket_count - 1; a++)
      {
        for (int b = a + 1; b < bucket_count; b++)
        {
          if (worst[rank_order[b]] > worst[rank_order[a]])
          {
            const int held = rank_order[a];
            rank_order[a] = rank_order[b];
            rank_order[b] = held;
          }
        }
      }

      // Save stream formatting so the manipulators used below do not leak.
      ios::fmtflags old_flags = cout.flags();
      streamsize old_precision = cout.precision();

      const double kernel_frac = (step_wall_seconds > 0.0) ? (100.0 * kernel_total / step_wall_seconds) : 0.0;
      cout << " RHS kernel split (max-rank accumulated over step " << step_index << "): total ";
      cout << setprecision(6) << kernel_total << " s (";
      cout << setprecision(4) << kernel_frac << "% of coarse step)" << endl;

      const int topn = Mymin(BSSN_FINE_TIMING_TOPN, bucket_count);
      for (int r = 0; r < topn; r++)
      {
        const int ib = rank_order[r];
        const double frac = (kernel_total > 0.0) ? (100.0 * worst[ib] / kernel_total) : 0.0;
        cout << " "
             << setw(20) << left << f_bssn_rhs_kernel_timing_label(ib)
             << " = " << setw(10) << right << setprecision(6) << worst[ib]
             << " s (" << setw(6) << setprecision(4) << frac << "% of kernel)" << endl;
      }
      cout << endl;

      cout.flags(old_flags);
      cout.precision(old_precision);
    }

    delete[] rank_order;
    delete[] mean;
    delete[] worst;
  }
}
#endif
//================================================================================================ //================================================================================================
// define bssn_class // define bssn_class
@@ -59,6 +292,7 @@ bssn_class::bssn_class(double Couranti, double StartTimei, double TotalTimei,
xc(0), yc(0), zc(0), xr(0), yr(0), zr(0), trigger(0), dTT(0), dumpid(0), xc(0), yc(0), zc(0), xr(0), yr(0), zr(0), trigger(0), dTT(0), dumpid(0),
#endif #endif
a_lev(a_levi), maxl(maxli), decn(decni), maxrex(maxrexi), drex(drexi), a_lev(a_levi), maxl(maxli), decn(decni), maxrex(maxrexi), drex(drexi),
ConstraintRefreshLevels(0),
CheckPoint(0) CheckPoint(0)
// CheckPoint(0) // CheckPoint(0)
{ {
@@ -101,6 +335,24 @@ bssn_class::bssn_class(double Couranti, double StartTimei, double TotalTimei,
a_stream.str(""); a_stream.str("");
a_stream << setw(15) << "# time Ham Px Py Pz Gx Gy Gz"; a_stream << setw(15) << "# time Ham Px Py Pz Gx Gy Gz";
ConVMonitor = new monitor("bssn_constraint.dat", myrank, a_stream.str()); ConVMonitor = new monitor("bssn_constraint.dat", myrank, a_stream.str());
#if BSSN_FINE_TIMING
a_stream.clear();
a_stream.str("");
a_stream << setw(8) << "# step";
a_stream << setw(14) << "wall";
for (int ib = 0; ib < step_timing::TB_COUNT; ib++)
a_stream << setw(18) << step_timing::bucket_labels[ib];
for (int ib = 0; ib < step_timing::TB_COUNT; ib++)
{
char str_avg[64];
sprintf(str_avg, "avg_%s", step_timing::bucket_labels[ib]);
a_stream << setw(18) << str_avg;
}
TimingMonitor = new monitor("bssn_step_timing.dat", myrank, a_stream.str());
#else
TimingMonitor = 0;
#endif
} }
// setup sphere integration engine // setup sphere integration engine
Waveshell = new surface_integral(Symmetry); Waveshell = new surface_integral(Symmetry);
@@ -696,6 +948,9 @@ void bssn_class::Initialize()
} }
} }
GH = new cgh(0, ngfs, Symmetry, pname, checkrun, ErrorMonitor); GH = new cgh(0, ngfs, Symmetry, pname, checkrun, ErrorMonitor);
ConstraintRefreshLevels = new int[GH->levels];
for (int il = 0; il < GH->levels; il++)
ConstraintRefreshLevels[il] = 0;
if (checkrun) if (checkrun)
CheckPoint->readcheck_cgh(PhysTime, GH, myrank, nprocs, Symmetry); CheckPoint->readcheck_cgh(PhysTime, GH, myrank, nprocs, Symmetry);
else else
@@ -736,6 +991,8 @@ void bssn_class::Initialize()
sync_cache_cor = new Parallel::SyncCache[GH->levels]; sync_cache_cor = new Parallel::SyncCache[GH->levels];
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
sync_cache_restrict = new Parallel::SyncCache[GH->levels];
sync_cache_outbd = new Parallel::SyncCache[GH->levels];
} }
//================================================================================================ //================================================================================================
@@ -783,6 +1040,8 @@ bssn_class::~bssn_class()
DumpList->clearList(); DumpList->clearList();
ConstraintList->clearList(); ConstraintList->clearList();
delete[] ConstraintRefreshLevels;
delete phio; delete phio;
delete trKo; delete trKo;
delete gxxo; delete gxxo;
@@ -1042,6 +1301,7 @@ bssn_class::~bssn_class()
delete BHMonitor; delete BHMonitor;
delete MAPMonitor; delete MAPMonitor;
delete ConVMonitor; delete ConVMonitor;
delete TimingMonitor;
delete Waveshell; delete Waveshell;
delete CheckPoint; delete CheckPoint;
@@ -2127,8 +2387,10 @@ void bssn_class::Evolve(int Steps)
#endif #endif
*/ */
#if BSSN_ENABLE_MEM_USAGE_LOG
perf bssn_perf; perf bssn_perf;
size_t current_min, current_avg, current_max, peak_min, peak_avg, peak_max; size_t current_min, current_avg, current_max, peak_min, peak_avg, peak_max;
#endif
for (int lev = 0; lev < GH->levels; lev++) for (int lev = 0; lev < GH->levels; lev++)
GH->Lt[lev] = PhysTime; GH->Lt[lev] = PhysTime;
@@ -2137,6 +2399,15 @@ void bssn_class::Evolve(int Steps)
for (int ncount = 1; ncount < Steps + 1; ncount++) for (int ncount = 1; ncount < Steps + 1; ncount++)
{ {
#if BSSN_FINE_TIMING
step_timing::reset();
#endif
#if BSSN_KERNEL_FINE_TIMING
f_bssn_rhs_kernel_timing_reset();
#endif
#if (BSSN_FINE_TIMING || BSSN_KERNEL_FINE_TIMING)
const double step_wall_start = MPI_Wtime();
#endif
// special for large mass ratio consideration // special for large mass ratio consideration
// if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6) // if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6)
// { GH->levels=GH->movls; } // { GH->levels=GH->movls; }
@@ -2163,6 +2434,7 @@ void bssn_class::Evolve(int Steps)
// When LastDump >= DumpTime, output corresponding binary data // When LastDump >= DumpTime, output corresponding binary data
if (LastDump >= DumpTime) if (LastDump >= DumpTime)
{ {
STEP_TIMER_DECL(timer_dump3d);
// misc::tillherecheck("before Dump_Data"); // misc::tillherecheck("before Dump_Data");
for (int lev = 0; lev < GH->levels; lev++) for (int lev = 0; lev < GH->levels; lev++)
@@ -2170,6 +2442,7 @@ void bssn_class::Evolve(int Steps)
#ifdef WithShell #ifdef WithShell
SH->Dump_Data(DumpList, 0, PhysTime, dT_mon); SH->Dump_Data(DumpList, 0, PhysTime, dT_mon);
#endif #endif
STEP_TIMER_ADD(TB_DUMP_3D, timer_dump3d);
LastDump = 0; LastDump = 0;
@@ -2182,10 +2455,12 @@ void bssn_class::Evolve(int Steps)
// When Last2dDump >= d2DumpTime, output corresponding 2D data // When Last2dDump >= d2DumpTime, output corresponding 2D data
if (Last2dDump >= d2DumpTime) if (Last2dDump >= d2DumpTime)
{ {
STEP_TIMER_DECL(timer_dump2d);
// misc::tillherecheck("before 2dDump_Data"); // misc::tillherecheck("before 2dDump_Data");
for (int lev = 0; lev < GH->levels; lev++) for (int lev = 0; lev < GH->levels; lev++)
Parallel::d2Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon); Parallel::d2Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon);
STEP_TIMER_ADD(TB_DUMP_2D, timer_dump2d);
Last2dDump = 0; Last2dDump = 0;
@@ -2210,10 +2485,12 @@ void bssn_class::Evolve(int Steps)
break; break;
#if (REGLEV == 1) #if (REGLEV == 1)
STEP_TIMER_DECL(timer_regrid);
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
STEP_TIMER_ADD(TB_REGRID, timer_regrid);
#endif #endif
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2)) #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2222,6 +2499,7 @@ void bssn_class::Evolve(int Steps)
// fgt(PhysTime-dT_mon,StartTime,dT_mon/2),ErrorMonitor); // fgt(PhysTime-dT_mon,StartTime,dT_mon/2),ErrorMonitor);
#endif #endif
#if BSSN_ENABLE_MEM_USAGE_LOG
// Retrieve memory usage information used during computation; master process prints it // Retrieve memory usage information used during computation; master process prints it
bssn_perf.MemoryUsage(&current_min, &current_avg, &current_max, bssn_perf.MemoryUsage(&current_min, &current_avg, &current_max,
&peak_min, &peak_avg, &peak_max, nprocs); &peak_min, &peak_avg, &peak_max, nprocs);
@@ -2237,6 +2515,7 @@ void bssn_class::Evolve(int Steps)
(double)peak_max / (1024.0 * 1024.0)); (double)peak_max / (1024.0 * 1024.0));
cout << endl; cout << endl;
} }
#endif
// Output puncture positions at each step // Output puncture positions at each step
if (myrank == 0) if (myrank == 0)
@@ -2251,10 +2530,13 @@ void bssn_class::Evolve(int Steps)
<< endl; << endl;
} }
cout << endl; cout << endl;
#if BSSN_ENABLE_STDIN_ABORT_POLL
cout << " If you think the physical evolution time is enough for this simulation, please input 'stop' in the terminal to stop the MPI processes in the next evolution step ! " << endl; cout << " If you think the physical evolution time is enough for this simulation, please input 'stop' in the terminal to stop the MPI processes in the next evolution step ! " << endl;
#endif
// cout << endl; // cout << endl;
} }
#if BSSN_ENABLE_STDIN_ABORT_POLL
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// If an "abort" command is detected on stdin, terminate MPI processes // If an "abort" command is detected on stdin, terminate MPI processes
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
@@ -2282,10 +2564,12 @@ void bssn_class::Evolve(int Steps)
} }
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
#endif
// When LastCheck >= CheckTime, perform runtime checks and output status data // When LastCheck >= CheckTime, perform runtime checks and output status data
if (LastCheck >= CheckTime) if (LastCheck >= CheckTime)
{ {
STEP_TIMER_DECL(timer_checkpoint);
LastCheck = 0; LastCheck = 0;
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass); CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
@@ -2294,7 +2578,20 @@ void bssn_class::Evolve(int Steps)
CheckPoint->writecheck_sh(PhysTime, SH); CheckPoint->writecheck_sh(PhysTime, SH);
#endif #endif
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas); CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
STEP_TIMER_ADD(TB_CHECKPOINT, timer_checkpoint);
} }
#if (BSSN_FINE_TIMING || BSSN_KERNEL_FINE_TIMING)
const double step_wall_seconds = MPI_Wtime() - step_wall_start;
#endif
#if BSSN_FINE_TIMING
if (ncount % BSSN_FINE_TIMING_EVERY == 0)
step_timing::report(myrank, nprocs, TimingMonitor, ncount, PhysTime, step_wall_seconds);
#endif
#if BSSN_KERNEL_FINE_TIMING
if (ncount % BSSN_FINE_TIMING_EVERY == 0)
rhs_kernel_timing_report::report(myrank, nprocs, ncount, step_wall_seconds);
#endif
} }
/* /*
#ifdef With_AHF #ifdef With_AHF
@@ -2426,10 +2723,16 @@ void bssn_class::RecursiveStep(int lev)
#endif #endif
#if (REGLEV == 0) #if (REGLEV == 0)
STEP_TIMER_DECL(timer_regrid_onelevel);
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } {
if (ConstraintRefreshLevels)
ConstraintRefreshLevels[lev] = 1;
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
}
STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel);
#endif #endif
} }
@@ -2608,7 +2911,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
#endif #endif
} }
@@ -2775,7 +3078,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2790,7 +3093,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2809,7 +3112,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2825,7 +3128,7 @@ void bssn_class::ParallelStep()
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -3022,6 +3325,7 @@ void bssn_class::Step(int lev, int YN)
// new code 2013-2-15, zjcao // new code 2013-2-15, zjcao
#if (MAPBH == 1) #if (MAPBH == 1)
STEP_TIMER_DECL(timer_bh_predictor);
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
{ {
@@ -3052,6 +3356,7 @@ void bssn_class::Step(int lev, int YN)
} }
} }
} }
STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor);
// data analysis part // data analysis part
// Warning NOTE: the variables1 are used as temp storege room // Warning NOTE: the variables1 are used as temp storege room
@@ -3074,6 +3379,7 @@ void bssn_class::Step(int lev, int YN)
int ERROR = 0; int ERROR = 0;
MyList<ss_patch> *sPp; MyList<ss_patch> *sPp;
STEP_TIMER_DECL(timer_predictor_rhs);
// Predictor // Predictor
MyList<Patch> *Pp = GH->PatL[lev]; MyList<Patch> *Pp = GH->PatL[lev];
while (Pp) while (Pp)
@@ -3349,6 +3655,9 @@ void bssn_class::Step(int lev, int YN)
} }
#endif #endif
STEP_TIMER_ADD(TB_PREDICTOR_RHS, timer_predictor_rhs);
STEP_TIMER_DECL(timer_predictor_sync);
Parallel::AsyncSyncState async_pre; Parallel::AsyncSyncState async_pre;
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
@@ -3386,6 +3695,7 @@ void bssn_class::Step(int lev, int YN)
} }
} }
#endif #endif
STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync);
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
@@ -3430,6 +3740,7 @@ void bssn_class::Step(int lev, int YN)
// corrector // corrector
for (iter_count = 1; iter_count < 4; iter_count++) for (iter_count = 1; iter_count < 4; iter_count++)
{ {
STEP_TIMER_DECL(timer_corrector_rhs);
// for RK4: t0, t0+dt/2, t0+dt/2, t0+dt; // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
if (iter_count == 1 || iter_count == 3) if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2; TRK4 += dT_lev / 2;
@@ -3709,6 +4020,9 @@ void bssn_class::Step(int lev, int YN)
} }
#endif #endif
STEP_TIMER_ADD(TB_CORRECTOR_RHS, timer_corrector_rhs);
STEP_TIMER_DECL(timer_corrector_sync);
Parallel::AsyncSyncState async_cor; Parallel::AsyncSyncState async_cor;
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
@@ -3748,8 +4062,10 @@ void bssn_class::Step(int lev, int YN)
} }
} }
#endif #endif
STEP_TIMER_ADD(TB_CORRECTOR_SYNC, timer_corrector_sync);
#if (MAPBH == 0) #if (MAPBH == 0)
STEP_TIMER_DECL(timer_bh_corrector);
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
{ {
@@ -3782,11 +4098,13 @@ void bssn_class::Step(int lev, int YN)
} }
} }
} }
STEP_TIMER_ADD(TB_BH_CORRECTOR, timer_bh_corrector);
#endif #endif
// swap time level // swap time level
if (iter_count < 3) if (iter_count < 3)
{ {
STEP_TIMER_DECL(timer_state_swap);
Pp = GH->PatL[lev]; Pp = GH->PatL[lev];
while (Pp) while (Pp)
{ {
@@ -3833,9 +4151,11 @@ void bssn_class::Step(int lev, int YN)
} }
} }
#endif #endif
STEP_TIMER_ADD(TB_STATE_SWAP, timer_state_swap);
} }
} }
#if (RPS == 0) #if (RPS == 0)
STEP_TIMER_DECL(timer_restrict_prolong);
// mesh refinement boundary part // mesh refinement boundary part
RestrictProlong(lev, YN, BB); RestrictProlong(lev, YN, BB);
@@ -3856,6 +4176,7 @@ void bssn_class::Step(int lev, int YN)
} }
#endif #endif
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
#endif #endif
// note the data structure before update // note the data structure before update
// SynchList_cor 1 ----------- // SynchList_cor 1 -----------
@@ -3864,6 +4185,7 @@ void bssn_class::Step(int lev, int YN)
// //
// OldStateList old ----------- // OldStateList old -----------
// update // update
STEP_TIMER_DECL(timer_state_commit);
Pp = GH->PatL[lev]; Pp = GH->PatL[lev];
while (Pp) while (Pp)
{ {
@@ -3920,6 +4242,7 @@ void bssn_class::Step(int lev, int YN)
Porg0[ithBH][2] = Porg1[ithBH][2]; Porg0[ithBH][2] = Porg1[ithBH][2];
} }
} }
STEP_TIMER_ADD(TB_STATE_SWAP, timer_state_commit);
} }
//================================================================================================ //================================================================================================
@@ -4246,7 +4569,9 @@ void bssn_class::Step(int lev, int YN)
} }
} }
#endif #endif
STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync);
STEP_TIMER_DECL(timer_bh_predictor);
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
{ {
@@ -4285,6 +4610,7 @@ void bssn_class::Step(int lev, int YN)
{ {
AnalysisStuff(lev, dT_lev); AnalysisStuff(lev, dT_lev);
} }
STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor);
// corrector // corrector
for (iter_count = 1; iter_count < 3; iter_count++) for (iter_count = 1; iter_count < 3; iter_count++)
{ {
@@ -5755,6 +6081,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// //
// SynchList_cor old ----------- // SynchList_cor old -----------
{ {
STEP_TIMER_DECL(timer_restrict_prolong);
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// stringstream a_stream; // stringstream a_stream;
// a_stream.setf(ios::left); // a_stream.setf(ios::left);
@@ -5796,7 +6123,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
#if (RPB == 0) #if (RPB == 0)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry); Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry); // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -5820,7 +6147,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
#endif #endif
@@ -5847,7 +6174,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
#if (RPB == 0) #if (RPB == 0)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
@@ -5871,7 +6198,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#endif #endif
@@ -5897,6 +6224,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif #endif
} }
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
} }
//================================================================================================ //================================================================================================
@@ -5916,6 +6244,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
// //
// SynchList_cor old ----------- // SynchList_cor old -----------
{ {
STEP_TIMER_DECL(timer_restrict_prolong);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"starting RestrictProlong_aux"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"starting RestrictProlong_aux");
if (lev >= GH->levels - 1) if (lev >= GH->levels - 1)
@@ -5940,7 +6269,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
} }
#if (RPB == 0) #if (RPB == 0)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry); Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry); // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -5950,7 +6279,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
#endif #endif
@@ -5962,7 +6291,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
else // no time refinement levels and for all same time levels else // no time refinement levels and for all same time levels
{ {
#if (RPB == 0) #if (RPB == 0)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
@@ -5972,7 +6301,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#endif #endif
@@ -5984,6 +6313,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
} }
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
} }
//================================================================================================ //================================================================================================
@@ -5994,6 +6324,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
void bssn_class::RestrictProlong(int lev, int YN, bool BB) void bssn_class::RestrictProlong(int lev, int YN, bool BB)
{ {
STEP_TIMER_DECL(timer_restrict_prolong);
double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
// we assume for fine // we assume for fine
// SynchList_cor 1 ----------- // SynchList_cor 1 -----------
@@ -6027,7 +6358,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
} }
#if (RPB == 0) #if (RPB == 0)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry); Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry); // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry);
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -6037,7 +6368,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
#endif #endif
@@ -6051,7 +6382,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
if (myrank == 0) if (myrank == 0)
cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl; cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
#if (RPB == 0) #if (RPB == 0)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry); // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry);
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
@@ -6061,7 +6392,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
#endif #endif
@@ -6073,6 +6404,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
} }
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
} }
//================================================================================================ //================================================================================================
@@ -6102,7 +6434,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
#endif #endif
@@ -6115,7 +6447,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
{ {
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
#endif #endif
@@ -6823,18 +7155,15 @@ void bssn_class::compute_Porg_rhs(double **BH_PS,double **BH_RHS,var *forx,var *
void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, var *fory, var *forz, int ilev) void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, var *fory, var *forz, int ilev)
{ {
const int InList = 3; MyList<var> DG_List_x(forx);
MyList<var> DG_List_y(fory);
MyList<var> DG_List_z(forz);
DG_List_x.next = &DG_List_y;
DG_List_y.next = &DG_List_z;
MyList<var> *DG_List = new MyList<var>(forx); double shellf[3];
DG_List->insert(fory); double pox_buf[3][1];
DG_List->insert(forz); double *pox[3] = {pox_buf[0], pox_buf[1], pox_buf[2]};
double *x1, *y1, *z1;
double *shellf;
shellf = new double[3];
double *pox[3];
for (int i = 0; i < 3; i++)
pox[i] = new double[1];
for (int n = 0; n < BH_num; n++) for (int n = 0; n < BH_num; n++)
{ {
@@ -6845,9 +7174,9 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
int lev = ilev; int lev = ilev;
#if (PSTR == 0) #if (PSTR == 0)
while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry)) while (!Parallel::PatList_Interp_Points(GH->PatL[lev], &DG_List_x, 1, pox, shellf, Symmetry))
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3) #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry, GH->Commlev[lev])) while (!Parallel::PatList_Interp_Points(GH->PatL[lev], &DG_List_x, 1, pox, shellf, Symmetry, GH->Commlev[lev]))
#endif #endif
{ {
lev--; lev--;
@@ -6856,7 +7185,7 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
ErrorMonitor->outfile << "fail to find black holes at t = " << PhysTime << endl; ErrorMonitor->outfile << "fail to find black holes at t = " << PhysTime << endl;
for (n = 0; n < BH_num; n++) for (n = 0; n < BH_num; n++)
ErrorMonitor->outfile << "(x,y,z) = (" ErrorMonitor->outfile << "(x,y,z) = ("
<< pox[0][n] << "," << pox[1][n] << "," << pox[2][n] << BH_PS[n][0] << "," << BH_PS[n][1] << "," << BH_PS[n][2]
<< ")" << endl; << ")" << endl;
break; break;
} }
@@ -6869,11 +7198,6 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
BH_RHS[n][2] = -shellf[2]; BH_RHS[n][2] = -shellf[2];
} }
} }
DG_List->clearList();
delete[] shellf;
for (int i = 0; i < 3; i++)
delete[] pox[i];
} }
#endif #endif
@@ -7096,6 +7420,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
IP = new double[NN]; IP = new double[NN];
RoutMAP = new double[7]; RoutMAP = new double[7];
double Rex = maxrex; double Rex = maxrex;
bool patch_mass_prepared = false;
#ifdef WithShell
bool shell_mass_prepared = false;
#endif
for (int i = 0; i < decn; i++) for (int i = 0; i < decn; i++)
{ {
#ifdef Point_Psi4 #ifdef Point_Psi4
@@ -7123,7 +7451,8 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0, gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0, Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables
RoutMAP, ErrorMonitor); RoutMAP, ErrorMonitor, !patch_mass_prepared);
patch_mass_prepared = true;
} }
else else
{ {
@@ -7131,44 +7460,52 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0, gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0, Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables
RoutMAP, ErrorMonitor); RoutMAP, ErrorMonitor, !shell_mass_prepared);
shell_mass_prepared = true;
} }
#else #else
Waveshell->surf_MassPAng(Rex, lev, GH, phi0, trK0, Waveshell->surf_MassPAng(Rex, lev, GH, phi0, trK0,
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0, gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0, Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables
RoutMAP, ErrorMonitor); RoutMAP, ErrorMonitor, !patch_mass_prepared);
patch_mass_prepared = true;
#endif #endif
#else #else
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before surface integral"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before surface integral");
#ifdef WithShell #ifdef WithShell
if (lev > 0 || Rex < GH->bbox[0][0][3]) if (lev > 0 || Rex < GH->bbox[0][0][3])
{ {
Waveshell->surf_Wave(Rex, lev, GH, Rpsi4, Ipsi4, 2, maxl, NN, RP, IP, ErrorMonitor); Waveshell->surf_WaveMassPAng(Rex, lev, GH,
Waveshell->surf_MassPAng(Rex, lev, GH, phi0, trK0, Rpsi4, Ipsi4, 2, maxl, NN, RP, IP,
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0, phi0, trK0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0, gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
RoutMAP, ErrorMonitor); Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1,
RoutMAP, ErrorMonitor, !patch_mass_prepared);
patch_mass_prepared = true;
} }
else else
{ {
Waveshell->surf_Wave(Rex, lev, SH, Rpsi4, Ipsi4, 2, maxl, NN, RP, IP, ErrorMonitor); Waveshell->surf_WaveMassPAng(Rex, lev, SH,
Waveshell->surf_MassPAng(Rex, lev, SH, phi0, trK0, Rpsi4, Ipsi4, 2, maxl, NN, RP, IP,
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0, phi0, trK0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0, gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
RoutMAP, ErrorMonitor); Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1,
RoutMAP, ErrorMonitor, !shell_mass_prepared);
shell_mass_prepared = true;
} }
#else #else
#if (PSTR == 0) #if (PSTR == 0)
Waveshell->surf_Wave(Rex, lev, GH, Rpsi4, Ipsi4, 2, maxl, NN, RP, IP, ErrorMonitor); Waveshell->surf_WaveMassPAng(Rex, lev, GH,
Waveshell->surf_MassPAng(Rex, lev, GH, phi0, trK0, Rpsi4, Ipsi4, 2, maxl, NN, RP, IP,
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0, phi0, trK0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0, gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
RoutMAP, ErrorMonitor); Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1,
RoutMAP, ErrorMonitor, !patch_mass_prepared);
patch_mass_prepared = true;
#elif (PSTR == 1 || PSTR == 2) #elif (PSTR == 1 || PSTR == 2)
Waveshell->surf_Wave(Rex, lev, GH, Rpsi4, Ipsi4, 2, maxl, NN, RP, IP, ErrorMonitor, GH->Commlev[lev]); Waveshell->surf_Wave(Rex, lev, GH, Rpsi4, Ipsi4, 2, maxl, NN, RP, IP, ErrorMonitor, GH->Commlev[lev]);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after surf_Wave"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after surf_Wave");
@@ -7176,7 +7513,8 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0, gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0, Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables
RoutMAP, ErrorMonitor, GH->Commlev[lev]); RoutMAP, ErrorMonitor, GH->Commlev[lev], !patch_mass_prepared);
patch_mass_prepared = true;
#endif #endif
#endif #endif
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"end surface integral"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"end surface integral");
@@ -7249,7 +7587,7 @@ void bssn_class::Constraint_Out()
for (int lev = 0; lev < GH->levels; lev++) for (int lev = 0; lev < GH->levels; lev++)
{ {
// make sure the data consistent for higher levels // make sure the data consistent for higher levels
if (lev > 0) // if the constrait quantities can be reused from the step rhs calculation if (lev > 0 && ConstraintRefreshLevels && ConstraintRefreshLevels[lev]) // only refresh levels whose grid layout changed after evolution
{ {
double TRK4 = PhysTime; double TRK4 = PhysTime;
double ndeps = numepsb; double ndeps = numepsb;
@@ -7403,35 +7741,18 @@ void bssn_class::Constraint_Out()
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
double ConV_h[7]; double ConV_h[7];
#endif #endif
var *ConstraintVars[7] = {Cons_Ham, Cons_Px, Cons_Py, Cons_Pz, Cons_Gx, Cons_Gy, Cons_Gz};
#ifdef WithShell #ifdef WithShell
ConV[0] = SH->L2Norm(Cons_Ham); SH->L2Norm7(ConstraintVars, ConV);
ConV[1] = SH->L2Norm(Cons_Px);
ConV[2] = SH->L2Norm(Cons_Py);
ConV[3] = SH->L2Norm(Cons_Pz);
ConV[4] = SH->L2Norm(Cons_Gx);
ConV[5] = SH->L2Norm(Cons_Gy);
ConV[6] = SH->L2Norm(Cons_Gz);
ConVMonitor->writefile(PhysTime, 7, ConV); ConVMonitor->writefile(PhysTime, 7, ConV);
#endif #endif
for (int levi = 0; levi < GH->levels; levi++) for (int levi = 0; levi < GH->levels; levi++)
{ {
#if (PSTR == 0) #if (PSTR == 0)
ConV[0] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Ham); Parallel::L2Norm7(GH->PatL[levi]->data, ConstraintVars, ConV);
ConV[1] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Px);
ConV[2] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Py);
ConV[3] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Pz);
ConV[4] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Gx);
ConV[5] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Gy);
ConV[6] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Gz);
#elif (PSTR == 1 || PSTR == 2) #elif (PSTR == 1 || PSTR == 2)
ConV[0] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Ham, GH->Commlev[levi]); Parallel::L2Norm7(GH->PatL[levi]->data, ConstraintVars, ConV, GH->Commlev[levi]);
ConV[1] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Px, GH->Commlev[levi]);
ConV[2] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Py, GH->Commlev[levi]);
ConV[3] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Pz, GH->Commlev[levi]);
ConV[4] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Gx, GH->Commlev[levi]);
ConV[5] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Gy, GH->Commlev[levi]);
ConV[6] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Gz, GH->Commlev[levi]);
// misc::tillherecheck("before collect data to cpu0"); // misc::tillherecheck("before collect data to cpu0");
// MPI_ALLREDUCE( sendbuf, recvbuf, count, datatype, op, comm), sendbuf and recvbuf must be different // MPI_ALLREDUCE( sendbuf, recvbuf, count, datatype, op, comm), sendbuf and recvbuf must be different
if (levi > 0) if (levi > 0)
@@ -7462,6 +7783,9 @@ void bssn_class::Constraint_Out()
Interp_Constraint(false); Interp_Constraint(false);
LastConsOut = 0; LastConsOut = 0;
if (ConstraintRefreshLevels)
for (int lev = 0; lev < GH->levels; lev++)
ConstraintRefreshLevels[lev] = 0;
} }
} }

View File

@@ -48,6 +48,7 @@ public:
double StartTime, TotalTime; double StartTime, TotalTime;
double AnasTime, DumpTime, d2DumpTime, CheckTime; double AnasTime, DumpTime, d2DumpTime, CheckTime;
double LastAnas, LastConsOut; double LastAnas, LastConsOut;
int *ConstraintRefreshLevels;
double Courant; double Courant;
double numepss, numepsb, numepsh; double numepss, numepsb, numepsh;
int Symmetry; int Symmetry;
@@ -130,9 +131,11 @@ public:
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1] Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev] Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor; monitor *ConVMonitor, *TimingMonitor;
surface_integral *Waveshell; surface_integral *Waveshell;
checkpoint *CheckPoint; checkpoint *CheckPoint;

View File

@@ -62,6 +62,7 @@
real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
! gont = 0: success; gont = 1: something wrong ! gont = 0: success; gont = 1: something wrong
integer::gont integer::gont
integer :: i,j,k
!~~~~~~> Other variables: !~~~~~~> Other variables:
@@ -85,6 +86,13 @@
real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
real*8 :: dX, dY, dZ, PI real*8 :: dX, dY, dZ, PI
real*8 :: divb_loc,det_loc
real*8 :: gupxx_loc,gupxy_loc,gupxz_loc,gupyy_loc,gupyz_loc,gupzz_loc
real*8 :: Rxx_loc,Rxy_loc,Rxz_loc,Ryy_loc,Ryz_loc,Rzz_loc
real*8 :: fxx_loc,fxy_loc,fxz_loc
real*8 :: Gamxa_loc,Gamya_loc,Gamza_loc
real*8 :: f_loc,chin_loc
real*8 :: l_fxx,l_fxy,l_fxz,l_fyy,l_fyz,l_fzz,S_loc
real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0 real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0 real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0 real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
@@ -97,7 +105,7 @@
#endif #endif
#if (GAUGE == 6 || GAUGE == 7) #if (GAUGE == 6 || GAUGE == 7)
integer :: BHN,i,j,k integer :: BHN
real*8, dimension(9) :: Porg real*8, dimension(9) :: Porg
real*8, dimension(3) :: Mass real*8, dimension(3) :: Mass
real*8 :: r1,r2,M,A,w1,w2,C1,C2 real*8 :: r1,r2,M,A,w1,w2,C1,C2
@@ -145,22 +153,24 @@
dY = Y(2) - Y(1) dY = Y(2) - Y(1)
dZ = Z(2) - Z(1) dZ = Z(2) - Z(1)
alpn1 = Lap + ONE do k=1,ex(3)
chin1 = chi + ONE do j=1,ex(2)
gxx = dxx + ONE do i=1,ex(1)
gyy = dyy + ONE alpn1(i,j,k) = Lap(i,j,k) + ONE
gzz = dzz + ONE chin1(i,j,k) = chi(i,j,k) + ONE
gxx(i,j,k) = dxx(i,j,k) + ONE
gyy(i,j,k) = dyy(i,j,k) + ONE
gzz(i,j,k) = dzz(i,j,k) + ONE
enddo
enddo
enddo
call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev) call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev)
call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev) call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev)
call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev) call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev)
div_beta = betaxx + betayy + betazz
call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev) call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev) call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev) call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
@@ -168,151 +178,179 @@
call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev) call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + & do k=1,ex(3)
TWO *( gxx * betaxx + gxy * betayx + gxz * betazx) do j=1,ex(2)
do i=1,ex(1)
divb_loc = betaxx(i,j,k) + betayy(i,j,k) + betazz(i,j,k)
div_beta(i,j,k) = divb_loc
gyy_rhs = - TWO * alpn1 * Ayy - F2o3 * gyy * div_beta + & chi_rhs(i,j,k) = F2o3 * chin1(i,j,k) * (alpn1(i,j,k) * trK(i,j,k) - divb_loc)
TWO *( gxy * betaxy + gyy * betayy + gyz * betazy)
gzz_rhs = - TWO * alpn1 * Azz - F2o3 * gzz * div_beta + & gxx_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axx(i,j,k) - F2o3 * gxx(i,j,k) * divb_loc + &
TWO *( gxz * betaxz + gyz * betayz + gzz * betazz) TWO * ( gxx(i,j,k) * betaxx(i,j,k) + gxy(i,j,k) * betayx(i,j,k) + gxz(i,j,k) * betazx(i,j,k) )
gxy_rhs = - TWO * alpn1 * Axy + F1o3 * gxy * div_beta + & gyy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayy(i,j,k) - F2o3 * gyy(i,j,k) * divb_loc + &
gxx * betaxy + gxz * betazy + & TWO * ( gxy(i,j,k) * betaxy(i,j,k) + gyy(i,j,k) * betayy(i,j,k) + gyz(i,j,k) * betazy(i,j,k) )
gyy * betayx + gyz * betazx &
- gxy * betazz
gyz_rhs = - TWO * alpn1 * Ayz + F1o3 * gyz * div_beta + & gzz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Azz(i,j,k) - F2o3 * gzz(i,j,k) * divb_loc + &
gxy * betaxz + gyy * betayz + & TWO * ( gxz(i,j,k) * betaxz(i,j,k) + gyz(i,j,k) * betayz(i,j,k) + gzz(i,j,k) * betazz(i,j,k) )
gxz * betaxy + gzz * betazy &
- gyz * betaxx
gxz_rhs = - TWO * alpn1 * Axz + F1o3 * gxz * div_beta + & gxy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axy(i,j,k) + F1o3 * gxy(i,j,k) * divb_loc + &
gxx * betaxz + gxy * betayz + & gxx(i,j,k) * betaxy(i,j,k) + gxz(i,j,k) * betazy(i,j,k) + gyy(i,j,k) * betayx(i,j,k) + &
gyz * betayx + gzz * betazx & gyz(i,j,k) * betazx(i,j,k) - gxy(i,j,k) * betazz(i,j,k)
- gxz * betayy !rhs for gij
! invert tilted metric gyz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayz(i,j,k) + F1o3 * gyz(i,j,k) * divb_loc + &
gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - & gxy(i,j,k) * betaxz(i,j,k) + gyy(i,j,k) * betayz(i,j,k) + gxz(i,j,k) * betaxy(i,j,k) + &
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz gzz(i,j,k) * betazy(i,j,k) - gyz(i,j,k) * betaxx(i,j,k)
gupxx = ( gyy * gzz - gyz * gyz ) / gupzz
gupxy = - ( gxy * gzz - gyz * gxz ) / gupzz
gupxz = ( gxy * gyz - gyy * gxz ) / gupzz
gupyy = ( gxx * gzz - gxz * gxz ) / gupzz
gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz
gupzz = ( gxx * gyy - gxy * gxy ) / gupzz
if(co == 0)then gxz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axz(i,j,k) + F1o3 * gxz(i,j,k) * divb_loc + &
! Gam^i_Res = Gam^i + gup^ij_,j gxx(i,j,k) * betaxz(i,j,k) + gxy(i,j,k) * betayz(i,j,k) + gyz(i,j,k) * betayx(i,j,k) + &
Gmx_Res = Gamx - (gupxx*(gupxx*gxxx+gupxy*gxyx+gupxz*gxzx)& gzz(i,j,k) * betazx(i,j,k) - gxz(i,j,k) * betayy(i,j,k)
+gupxy*(gupxx*gxyx+gupxy*gyyx+gupxz*gyzx)&
+gupxz*(gupxx*gxzx+gupxy*gyzx+gupxz*gzzx)&
+gupxx*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+gupxy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+gupxz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+gupxx*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupxy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+gupxz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
Gmy_Res = Gamy - (gupxx*(gupxy*gxxx+gupyy*gxyx+gupyz*gxzx)&
+gupxy*(gupxy*gxyx+gupyy*gyyx+gupyz*gyzx)&
+gupxz*(gupxy*gxzx+gupyy*gyzx+gupyz*gzzx)&
+gupxy*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+gupyy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+gupyz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+gupxy*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupyy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+gupyz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
Gmz_Res = Gamz - (gupxx*(gupxz*gxxx+gupyz*gxyx+gupzz*gxzx)&
+gupxy*(gupxz*gxyx+gupyz*gyyx+gupzz*gyzx)&
+gupxz*(gupxz*gxzx+gupyz*gyzx+gupzz*gzzx)&
+gupxy*(gupxz*gxxy+gupyz*gxyy+gupzz*gxzy)&
+gupyy*(gupxz*gxyy+gupyz*gyyy+gupzz*gyzy)&
+gupyz*(gupxz*gxzy+gupyz*gyzy+gupzz*gzzy)&
+gupxz*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupyz*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+gupzz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
endif
! second kind of connection det_loc = gxx(i,j,k) * gyy(i,j,k) * gzz(i,j,k) + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) + &
Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz )) gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) - gxz(i,j,k) * gyy(i,j,k) * gxz(i,j,k) - &
Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz )) gxy(i,j,k) * gxy(i,j,k) * gzz(i,j,k) - gxx(i,j,k) * gyz(i,j,k) * gyz(i,j,k)
Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz )) gupxx_loc = ( gyy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gyz(i,j,k) ) / det_loc
gupxy_loc = - ( gxy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gxz(i,j,k) ) / det_loc
gupxz_loc = ( gxy(i,j,k) * gyz(i,j,k) - gyy(i,j,k) * gxz(i,j,k) ) / det_loc
gupyy_loc = ( gxx(i,j,k) * gzz(i,j,k) - gxz(i,j,k) * gxz(i,j,k) ) / det_loc
gupyz_loc = - ( gxx(i,j,k) * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / det_loc
gupzz_loc = ( gxx(i,j,k) * gyy(i,j,k) - gxy(i,j,k) * gxy(i,j,k) ) / det_loc
gupxx(i,j,k) = gupxx_loc
gupxy(i,j,k) = gupxy_loc
gupxz(i,j,k) = gupxz_loc
gupyy(i,j,k) = gupyy_loc
gupyz(i,j,k) = gupyz_loc
gupzz(i,j,k) = gupzz_loc
Gamxyy =HALF*( gupxx*(TWO*gxyy - gyyx ) + gupxy*gyyy + gupxz*(TWO*gyzy - gyyz )) if(co == 0)then
Gamyyy =HALF*( gupxy*(TWO*gxyy - gyyx ) + gupyy*gyyy + gupyz*(TWO*gyzy - gyyz )) Gmx_Res(i,j,k) = Gamx(i,j,k) - ( &
Gamzyy =HALF*( gupxz*(TWO*gxyy - gyyx ) + gupyz*gyyy + gupzz*(TWO*gyzy - gyyz )) gupxx_loc*(gupxx_loc*gxxx(i,j,k)+gupxy_loc*gxyx(i,j,k)+gupxz_loc*gxzx(i,j,k)) + &
gupxy_loc*(gupxx_loc*gxyx(i,j,k)+gupxy_loc*gyyx(i,j,k)+gupxz_loc*gyzx(i,j,k)) + &
gupxz_loc*(gupxx_loc*gxzx(i,j,k)+gupxy_loc*gyzx(i,j,k)+gupxz_loc*gzzx(i,j,k)) + &
gupxx_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
gupxy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
gupxz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
gupxx_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
gupxy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
gupxz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
Gmy_Res(i,j,k) = Gamy(i,j,k) - ( &
gupxx_loc*(gupxy_loc*gxxx(i,j,k)+gupyy_loc*gxyx(i,j,k)+gupyz_loc*gxzx(i,j,k)) + &
gupxy_loc*(gupxy_loc*gxyx(i,j,k)+gupyy_loc*gyyx(i,j,k)+gupyz_loc*gyzx(i,j,k)) + &
gupxz_loc*(gupxy_loc*gxzx(i,j,k)+gupyy_loc*gyzx(i,j,k)+gupyz_loc*gzzx(i,j,k)) + &
gupxy_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
gupyy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
gupyz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
gupxy_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
gupyy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
gupyz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
Gmz_Res(i,j,k) = Gamz(i,j,k) - ( &
gupxx_loc*(gupxz_loc*gxxx(i,j,k)+gupyz_loc*gxyx(i,j,k)+gupzz_loc*gxzx(i,j,k)) + &
gupxy_loc*(gupxz_loc*gxyx(i,j,k)+gupyz_loc*gyyx(i,j,k)+gupzz_loc*gyzx(i,j,k)) + &
gupxz_loc*(gupxz_loc*gxzx(i,j,k)+gupyz_loc*gyzx(i,j,k)+gupzz_loc*gzzx(i,j,k)) + &
gupxy_loc*(gupxz_loc*gxxy(i,j,k)+gupyz_loc*gxyy(i,j,k)+gupzz_loc*gxzy(i,j,k)) + &
gupyy_loc*(gupxz_loc*gxyy(i,j,k)+gupyz_loc*gyyy(i,j,k)+gupzz_loc*gyzy(i,j,k)) + &
gupyz_loc*(gupxz_loc*gxzy(i,j,k)+gupyz_loc*gyzy(i,j,k)+gupzz_loc*gzzy(i,j,k)) + &
gupxz_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
gupyz_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
gupzz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
endif
Gamxzz =HALF*( gupxx*(TWO*gxzz - gzzx ) + gupxy*(TWO*gyzz - gzzy ) + gupxz*gzzz) Gamxxx(i,j,k)=HALF*( gupxx_loc*gxxx(i,j,k) + gupxy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupxz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
Gamyzz =HALF*( gupxy*(TWO*gxzz - gzzx ) + gupyy*(TWO*gyzz - gzzy ) + gupyz*gzzz) Gamyxx(i,j,k)=HALF*( gupxy_loc*gxxx(i,j,k) + gupyy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupyz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
Gamzzz =HALF*( gupxz*(TWO*gxzz - gzzx ) + gupyz*(TWO*gyzz - gzzy ) + gupzz*gzzz) Gamzxx(i,j,k)=HALF*( gupxz_loc*gxxx(i,j,k) + gupyz_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupzz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
Gamxxy =HALF*( gupxx*gxxy + gupxy*gyyx + gupxz*( gxzy + gyzx - gxyz ) ) Gamxyy(i,j,k)=HALF*( gupxx_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupxy_loc*gyyy(i,j,k) + gupxz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
Gamyxy =HALF*( gupxy*gxxy + gupyy*gyyx + gupyz*( gxzy + gyzx - gxyz ) ) Gamyyy(i,j,k)=HALF*( gupxy_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyy_loc*gyyy(i,j,k) + gupyz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
Gamzxy =HALF*( gupxz*gxxy + gupyz*gyyx + gupzz*( gxzy + gyzx - gxyz ) ) Gamzyy(i,j,k)=HALF*( gupxz_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyz_loc*gyyy(i,j,k) + gupzz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
Gamxxz =HALF*( gupxx*gxxz + gupxy*( gxyz + gyzx - gxzy ) + gupxz*gzzx ) Gamxzz(i,j,k)=HALF*( gupxx_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupxy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupxz_loc*gzzz(i,j,k))
Gamyxz =HALF*( gupxy*gxxz + gupyy*( gxyz + gyzx - gxzy ) + gupyz*gzzx ) Gamyzz(i,j,k)=HALF*( gupxy_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupyz_loc*gzzz(i,j,k))
Gamzxz =HALF*( gupxz*gxxz + gupyz*( gxyz + gyzx - gxzy ) + gupzz*gzzx ) Gamzzz(i,j,k)=HALF*( gupxz_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyz_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupzz_loc*gzzz(i,j,k))
Gamxyz =HALF*( gupxx*( gxyz + gxzy - gyzx ) + gupxy*gyyz + gupxz*gzzy ) Gamxxy(i,j,k)=HALF*( gupxx_loc*gxxy(i,j,k) + gupxy_loc*gyyx(i,j,k) + gupxz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
Gamyyz =HALF*( gupxy*( gxyz + gxzy - gyzx ) + gupyy*gyyz + gupyz*gzzy ) Gamyxy(i,j,k)=HALF*( gupxy_loc*gxxy(i,j,k) + gupyy_loc*gyyx(i,j,k) + gupyz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
Gamzyz =HALF*( gupxz*( gxyz + gxzy - gyzx ) + gupyz*gyyz + gupzz*gzzy ) Gamzxy(i,j,k)=HALF*( gupxz_loc*gxxy(i,j,k) + gupyz_loc*gyyx(i,j,k) + gupzz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
Gamxxz(i,j,k)=HALF*( gupxx_loc*gxxz(i,j,k) + gupxy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupxz_loc*gzzx(i,j,k) )
Gamyxz(i,j,k)=HALF*( gupxy_loc*gxxz(i,j,k) + gupyy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupyz_loc*gzzx(i,j,k) )
Gamzxz(i,j,k)=HALF*( gupxz_loc*gxxz(i,j,k) + gupyz_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupzz_loc*gzzx(i,j,k) )
Gamxyz(i,j,k)=HALF*( gupxx_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupxy_loc*gyyz(i,j,k) + gupxz_loc*gzzy(i,j,k) )
Gamyyz(i,j,k)=HALF*( gupxy_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyy_loc*gyyz(i,j,k) + gupyz_loc*gzzy(i,j,k) )
Gamzyz(i,j,k)=HALF*( gupxz_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyz_loc*gyyz(i,j,k) + gupzz_loc*gzzy(i,j,k) )
enddo
enddo
enddo
! Raise indices of \tilde A_{ij} and store in R_ij ! Raise indices of \tilde A_{ij} and store in R_ij
Rxx = gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + &
TWO*(gupxx * gupxy * Axy + gupxx * gupxz * Axz + gupxy * gupxz * Ayz)
Ryy = gupxy * gupxy * Axx + gupyy * gupyy * Ayy + gupyz * gupyz * Azz + &
TWO*(gupxy * gupyy * Axy + gupxy * gupyz * Axz + gupyy * gupyz * Ayz)
Rzz = gupxz * gupxz * Axx + gupyz * gupyz * Ayy + gupzz * gupzz * Azz + &
TWO*(gupxz * gupyz * Axy + gupxz * gupzz * Axz + gupyz * gupzz * Ayz)
Rxy = gupxx * gupxy * Axx + gupxy * gupyy * Ayy + gupxz * gupyz * Azz + &
(gupxx * gupyy + gupxy * gupxy)* Axy + &
(gupxx * gupyz + gupxz * gupxy)* Axz + &
(gupxy * gupyz + gupxz * gupyy)* Ayz
Rxz = gupxx * gupxz * Axx + gupxy * gupyz * Ayy + gupxz * gupzz * Azz + &
(gupxx * gupyz + gupxy * gupxz)* Axy + &
(gupxx * gupzz + gupxz * gupxz)* Axz + &
(gupxy * gupzz + gupxz * gupyz)* Ayz
Ryz = gupxy * gupxz * Axx + gupyy * gupyz * Ayy + gupyz * gupzz * Azz + &
(gupxy * gupyz + gupyy * gupxz)* Axy + &
(gupxy * gupzz + gupyz * gupxz)* Axz + &
(gupyy * gupzz + gupyz * gupyz)* Ayz
! Right hand side for Gam^i without shift terms... ! Right hand side for Gam^i without shift terms...
call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev) call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
gupxx_loc = gupxx(i,j,k)
gupxy_loc = gupxy(i,j,k)
gupxz_loc = gupxz(i,j,k)
gupyy_loc = gupyy(i,j,k)
gupyz_loc = gupyz(i,j,k)
gupzz_loc = gupzz(i,j,k)
Gamx_rhs = - TWO * ( Lapx * Rxx + Lapy * Rxy + Lapz * Rxz ) + & Rxx_loc = gupxx_loc * gupxx_loc * Axx(i,j,k) + gupxy_loc * gupxy_loc * Ayy(i,j,k) + gupxz_loc * gupxz_loc * Azz(i,j,k) + &
TWO * alpn1 * ( & TWO * (gupxx_loc * gupxy_loc * Axy(i,j,k) + gupxx_loc * gupxz_loc * Axz(i,j,k) + gupxy_loc * gupxz_loc * Ayz(i,j,k))
-F3o2/chin1 * ( chix * Rxx + chiy * Rxy + chiz * Rxz ) - & Ryy_loc = gupxy_loc * gupxy_loc * Axx(i,j,k) + gupyy_loc * gupyy_loc * Ayy(i,j,k) + gupyz_loc * gupyz_loc * Azz(i,j,k) + &
gupxx * ( F2o3 * Kx + EIGHT * PI * Sx ) - & TWO * (gupxy_loc * gupyy_loc * Axy(i,j,k) + gupxy_loc * gupyz_loc * Axz(i,j,k) + gupyy_loc * gupyz_loc * Ayz(i,j,k))
gupxy * ( F2o3 * Ky + EIGHT * PI * Sy ) - & Rzz_loc = gupxz_loc * gupxz_loc * Axx(i,j,k) + gupyz_loc * gupyz_loc * Ayy(i,j,k) + gupzz_loc * gupzz_loc * Azz(i,j,k) + &
gupxz * ( F2o3 * Kz + EIGHT * PI * Sz ) + & TWO * (gupxz_loc * gupyz_loc * Axy(i,j,k) + gupxz_loc * gupzz_loc * Axz(i,j,k) + gupyz_loc * gupzz_loc * Ayz(i,j,k))
Gamxxx * Rxx + Gamxyy * Ryy + Gamxzz * Rzz + & Rxy_loc = gupxx_loc * gupxy_loc * Axx(i,j,k) + gupxy_loc * gupyy_loc * Ayy(i,j,k) + gupxz_loc * gupyz_loc * Azz(i,j,k) + &
TWO * ( Gamxxy * Rxy + Gamxxz * Rxz + Gamxyz * Ryz ) ) (gupxx_loc * gupyy_loc + gupxy_loc * gupxy_loc) * Axy(i,j,k) + &
(gupxx_loc * gupyz_loc + gupxz_loc * gupxy_loc) * Axz(i,j,k) + &
(gupxy_loc * gupyz_loc + gupxz_loc * gupyy_loc) * Ayz(i,j,k)
Rxz_loc = gupxx_loc * gupxz_loc * Axx(i,j,k) + gupxy_loc * gupyz_loc * Ayy(i,j,k) + gupxz_loc * gupzz_loc * Azz(i,j,k) + &
(gupxx_loc * gupyz_loc + gupxy_loc * gupxz_loc) * Axy(i,j,k) + &
(gupxx_loc * gupzz_loc + gupxz_loc * gupxz_loc) * Axz(i,j,k) + &
(gupxy_loc * gupzz_loc + gupxz_loc * gupyz_loc) * Ayz(i,j,k)
Ryz_loc = gupxy_loc * gupxz_loc * Axx(i,j,k) + gupyy_loc * gupyz_loc * Ayy(i,j,k) + gupyz_loc * gupzz_loc * Azz(i,j,k) + &
(gupxy_loc * gupyz_loc + gupyy_loc * gupxz_loc) * Axy(i,j,k) + &
(gupxy_loc * gupzz_loc + gupyz_loc * gupxz_loc) * Axz(i,j,k) + &
(gupyy_loc * gupzz_loc + gupyz_loc * gupyz_loc) * Ayz(i,j,k)
Rxx(i,j,k) = Rxx_loc
Ryy(i,j,k) = Ryy_loc
Rzz(i,j,k) = Rzz_loc
Rxy(i,j,k) = Rxy_loc
Rxz(i,j,k) = Rxz_loc
Ryz(i,j,k) = Ryz_loc
Gamy_rhs = - TWO * ( Lapx * Rxy + Lapy * Ryy + Lapz * Ryz ) + & Gamx_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxx_loc + Lapy(i,j,k) * Rxy_loc + Lapz(i,j,k) * Rxz_loc) + &
TWO * alpn1 * ( & TWO * alpn1(i,j,k) * ( &
-F3o2/chin1 * ( chix * Rxy + chiy * Ryy + chiz * Ryz ) - & -F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxx_loc + chiy(i,j,k) * Rxy_loc + chiz(i,j,k) * Rxz_loc) - &
gupxy * ( F2o3 * Kx + EIGHT * PI * Sx ) - & gupxx_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupyy * ( F2o3 * Ky + EIGHT * PI * Sy ) - & gupxy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupyz * ( F2o3 * Kz + EIGHT * PI * Sz ) + & gupxz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamyxx * Rxx + Gamyyy * Ryy + Gamyzz * Rzz + & Gamxxx(i,j,k) * Rxx_loc + Gamxyy(i,j,k) * Ryy_loc + Gamxzz(i,j,k) * Rzz_loc + &
TWO * ( Gamyxy * Rxy + Gamyxz * Rxz + Gamyyz * Ryz ) ) TWO * (Gamxxy(i,j,k) * Rxy_loc + Gamxxz(i,j,k) * Rxz_loc + Gamxyz(i,j,k) * Ryz_loc))
Gamz_rhs = - TWO * ( Lapx * Rxz + Lapy * Ryz + Lapz * Rzz ) + & Gamy_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxy_loc + Lapy(i,j,k) * Ryy_loc + Lapz(i,j,k) * Ryz_loc) + &
TWO * alpn1 * ( & TWO * alpn1(i,j,k) * ( &
-F3o2/chin1 * ( chix * Rxz + chiy * Ryz + chiz * Rzz ) - & -F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxy_loc + chiy(i,j,k) * Ryy_loc + chiz(i,j,k) * Ryz_loc) - &
gupxz * ( F2o3 * Kx + EIGHT * PI * Sx ) - & gupxy_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupyz * ( F2o3 * Ky + EIGHT * PI * Sy ) - & gupyy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupzz * ( F2o3 * Kz + EIGHT * PI * Sz ) + & gupyz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz + & Gamyxx(i,j,k) * Rxx_loc + Gamyyy(i,j,k) * Ryy_loc + Gamyzz(i,j,k) * Rzz_loc + &
TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) ) TWO * (Gamyxy(i,j,k) * Rxy_loc + Gamyxz(i,j,k) * Rxz_loc + Gamyyz(i,j,k) * Ryz_loc))
Gamz_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxz_loc + Lapy(i,j,k) * Ryz_loc + Lapz(i,j,k) * Rzz_loc) + &
TWO * alpn1(i,j,k) * ( &
-F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxz_loc + chiy(i,j,k) * Ryz_loc + chiz(i,j,k) * Rzz_loc) - &
gupxz_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupyz_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupzz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamzxx(i,j,k) * Rxx_loc + Gamzyy(i,j,k) * Ryy_loc + Gamzzz(i,j,k) * Rzz_loc + &
TWO * (Gamzxy(i,j,k) * Rxy_loc + Gamzxz(i,j,k) * Rxz_loc + Gamzyz(i,j,k) * Ryz_loc))
enddo
enddo
enddo
call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,& call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,&
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev) X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev)
@@ -321,38 +359,54 @@
call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,& call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,&
X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev) X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev)
fxx = gxxx + gxyy + gxzz
fxy = gxyx + gyyy + gyzz
fxz = gxzx + gyzy + gzzz
Gamxa = gupxx * Gamxxx + gupyy * Gamxyy + gupzz * Gamxzz + &
TWO*( gupxy * Gamxxy + gupxz * Gamxxz + gupyz * Gamxyz )
Gamya = gupxx * Gamyxx + gupyy * Gamyyy + gupzz * Gamyzz + &
TWO*( gupxy * Gamyxy + gupxz * Gamyxz + gupyz * Gamyyz )
Gamza = gupxx * Gamzxx + gupyy * Gamzyy + gupzz * Gamzzz + &
TWO*( gupxy * Gamzxy + gupxz * Gamzxz + gupyz * Gamzyz )
call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev) call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev) call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
divb_loc = div_beta(i,j,k)
fxx_loc = gxxx(i,j,k) + gxyy(i,j,k) + gxzz(i,j,k)
fxy_loc = gxyx(i,j,k) + gyyy(i,j,k) + gyzz(i,j,k)
fxz_loc = gxzx(i,j,k) + gyzy(i,j,k) + gzzz(i,j,k)
Gamx_rhs = Gamx_rhs + F2o3 * Gamxa * div_beta - & gupxx_loc = gupxx(i,j,k)
Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz + & gupxy_loc = gupxy(i,j,k)
F1o3 * (gupxx * fxx + gupxy * fxy + gupxz * fxz ) + & gupxz_loc = gupxz(i,j,k)
gupxx * gxxx + gupyy * gyyx + gupzz * gzzx + & gupyy_loc = gupyy(i,j,k)
TWO * (gupxy * gxyx + gupxz * gxzx + gupyz * gyzx ) gupyz_loc = gupyz(i,j,k)
gupzz_loc = gupzz(i,j,k)
Gamy_rhs = Gamy_rhs + F2o3 * Gamya * div_beta - & Gamxa_loc = gupxx_loc * Gamxxx(i,j,k) + gupyy_loc * Gamxyy(i,j,k) + gupzz_loc * Gamxzz(i,j,k) + &
Gamxa * betayx - Gamya * betayy - Gamza * betayz + & TWO * (gupxy_loc * Gamxxy(i,j,k) + gupxz_loc * Gamxxz(i,j,k) + gupyz_loc * Gamxyz(i,j,k))
F1o3 * (gupxy * fxx + gupyy * fxy + gupyz * fxz ) + & Gamya_loc = gupxx_loc * Gamyxx(i,j,k) + gupyy_loc * Gamyyy(i,j,k) + gupzz_loc * Gamyzz(i,j,k) + &
gupxx * gxxy + gupyy * gyyy + gupzz * gzzy + & TWO * (gupxy_loc * Gamyxy(i,j,k) + gupxz_loc * Gamyxz(i,j,k) + gupyz_loc * Gamyyz(i,j,k))
TWO * (gupxy * gxyy + gupxz * gxzy + gupyz * gyzy ) Gamza_loc = gupxx_loc * Gamzxx(i,j,k) + gupyy_loc * Gamzyy(i,j,k) + gupzz_loc * Gamzzz(i,j,k) + &
TWO * (gupxy_loc * Gamzxy(i,j,k) + gupxz_loc * Gamzxz(i,j,k) + gupyz_loc * Gamzyz(i,j,k))
Gamxa(i,j,k) = Gamxa_loc
Gamya(i,j,k) = Gamya_loc
Gamza(i,j,k) = Gamza_loc
Gamz_rhs = Gamz_rhs + F2o3 * Gamza * div_beta - & Gamx_rhs(i,j,k) = Gamx_rhs(i,j,k) + F2o3 * Gamxa_loc * divb_loc - &
Gamxa * betazx - Gamya * betazy - Gamza * betazz + & Gamxa_loc * betaxx(i,j,k) - Gamya_loc * betaxy(i,j,k) - Gamza_loc * betaxz(i,j,k) + &
F1o3 * (gupxz * fxx + gupyz * fxy + gupzz * fxz ) + & F1o3 * (gupxx_loc * fxx_loc + gupxy_loc * fxy_loc + gupxz_loc * fxz_loc) + &
gupxx * gxxz + gupyy * gyyz + gupzz * gzzz + & gupxx_loc * gxxx(i,j,k) + gupyy_loc * gyyx(i,j,k) + gupzz_loc * gzzx(i,j,k) + &
TWO * (gupxy * gxyz + gupxz * gxzz + gupyz * gyzz ) !rhs for Gam^i TWO * (gupxy_loc * gxyx(i,j,k) + gupxz_loc * gxzx(i,j,k) + gupyz_loc * gyzx(i,j,k))
Gamy_rhs(i,j,k) = Gamy_rhs(i,j,k) + F2o3 * Gamya_loc * divb_loc - &
Gamxa_loc * betayx(i,j,k) - Gamya_loc * betayy(i,j,k) - Gamza_loc * betayz(i,j,k) + &
F1o3 * (gupxy_loc * fxx_loc + gupyy_loc * fxy_loc + gupyz_loc * fxz_loc) + &
gupxx_loc * gxxy(i,j,k) + gupyy_loc * gyyy(i,j,k) + gupzz_loc * gzzy(i,j,k) + &
TWO * (gupxy_loc * gxyy(i,j,k) + gupxz_loc * gxzy(i,j,k) + gupyz_loc * gyzy(i,j,k))
Gamz_rhs(i,j,k) = Gamz_rhs(i,j,k) + F2o3 * Gamza_loc * divb_loc - &
Gamxa_loc * betazx(i,j,k) - Gamya_loc * betazy(i,j,k) - Gamza_loc * betazz(i,j,k) + &
F1o3 * (gupxz_loc * fxx_loc + gupyz_loc * fxy_loc + gupzz_loc * fxz_loc) + &
gupxx_loc * gxxz(i,j,k) + gupyy_loc * gyyz(i,j,k) + gupzz_loc * gzzz(i,j,k) + &
TWO * (gupxy_loc * gxyz(i,j,k) + gupxz_loc * gxzz(i,j,k) + gupyz_loc * gyzz(i,j,k))
enddo
enddo
enddo
!first kind of connection stored in gij,k !first kind of connection stored in gij,k
gxxx = gxx * Gamxxx + gxy * Gamyxx + gxz * Gamzxx gxxx = gxx * Gamxxx + gxy * Gamyxx + gxz * Gamzxx
@@ -604,189 +658,187 @@
!covariant second derivative of chi respect to tilted metric !covariant second derivative of chi respect to tilted metric
call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz do k=1,ex(3)
fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz do j=1,ex(2)
fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz do i=1,ex(1)
fyy = fyy - Gamxyy * chix - Gamyyy * chiy - Gamzyy * chiz fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k) * chix(i,j,k) - Gamyxx(i,j,k) * chiy(i,j,k) - Gamzxx(i,j,k) * chiz(i,j,k)
fyz = fyz - Gamxyz * chix - Gamyyz * chiy - Gamzyz * chiz fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k) * chix(i,j,k) - Gamyxy(i,j,k) * chiy(i,j,k) - Gamzxy(i,j,k) * chiz(i,j,k)
fzz = fzz - Gamxzz * chix - Gamyzz * chiy - Gamzzz * chiz fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k) * chix(i,j,k) - Gamyxz(i,j,k) * chiy(i,j,k) - Gamzxz(i,j,k) * chiz(i,j,k)
! Store D^l D_l chi - 3/(2*chi) D^l chi D_l chi in f fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k) * chix(i,j,k) - Gamyyy(i,j,k) * chiy(i,j,k) - Gamzyy(i,j,k) * chiz(i,j,k)
fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k) * chix(i,j,k) - Gamyyz(i,j,k) * chiy(i,j,k) - Gamzyz(i,j,k) * chiz(i,j,k)
fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k) * chix(i,j,k) - Gamyzz(i,j,k) * chiy(i,j,k) - Gamzzz(i,j,k) * chiz(i,j,k)
f = gupxx * ( fxx - F3o2/chin1 * chix * chix ) + & chin_loc = chin1(i,j,k)
gupyy * ( fyy - F3o2/chin1 * chiy * chiy ) + & f_loc = gupxx(i,j,k) * (fxx(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chix(i,j,k)) + &
gupzz * ( fzz - F3o2/chin1 * chiz * chiz ) + & gupyy(i,j,k) * (fyy(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiy(i,j,k)) + &
TWO * gupxy * ( fxy - F3o2/chin1 * chix * chiy ) + & gupzz(i,j,k) * (fzz(i,j,k) - F3o2/chin_loc * chiz(i,j,k) * chiz(i,j,k)) + &
TWO * gupxz * ( fxz - F3o2/chin1 * chix * chiz ) + & TWO * gupxy(i,j,k) * (fxy(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiy(i,j,k)) + &
TWO * gupyz * ( fyz - F3o2/chin1 * chiy * chiz ) TWO * gupxz(i,j,k) * (fxz(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiz(i,j,k)) + &
! Add chi part to Ricci tensor: TWO * gupyz(i,j,k) * (fyz(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiz(i,j,k))
f(i,j,k) = f_loc
Rxx = Rxx + (fxx - chix*chix/chin1/TWO + gxx * f)/chin1/TWO Rxx(i,j,k) = Rxx(i,j,k) + (fxx(i,j,k) - chix(i,j,k)*chix(i,j,k)/chin_loc/TWO + gxx(i,j,k) * f_loc)/chin_loc/TWO
Ryy = Ryy + (fyy - chiy*chiy/chin1/TWO + gyy * f)/chin1/TWO Ryy(i,j,k) = Ryy(i,j,k) + (fyy(i,j,k) - chiy(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gyy(i,j,k) * f_loc)/chin_loc/TWO
Rzz = Rzz + (fzz - chiz*chiz/chin1/TWO + gzz * f)/chin1/TWO Rzz(i,j,k) = Rzz(i,j,k) + (fzz(i,j,k) - chiz(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gzz(i,j,k) * f_loc)/chin_loc/TWO
Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO Rxy(i,j,k) = Rxy(i,j,k) + (fxy(i,j,k) - chix(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gxy(i,j,k) * f_loc)/chin_loc/TWO
Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO Rxz(i,j,k) = Rxz(i,j,k) + (fxz(i,j,k) - chix(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gxz(i,j,k) * f_loc)/chin_loc/TWO
Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO Ryz(i,j,k) = Ryz(i,j,k) + (fyz(i,j,k) - chiy(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gyz(i,j,k) * f_loc)/chin_loc/TWO
enddo
enddo
enddo
! covariant second derivatives of the lapse respect to physical metric ! covariant second derivatives of the lapse respect to physical metric
call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, & call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
SYM,SYM,SYM,symmetry,Lev) SYM,SYM,SYM,symmetry,Lev)
gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1 do k=1,ex(3)
gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1 do j=1,ex(2)
gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1 do i=1,ex(1)
! now get physical second kind of connection chin_loc = chin1(i,j,k)
Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF gxxx(i,j,k) = (gupxx(i,j,k) * chix(i,j,k) + gupxy(i,j,k) * chiy(i,j,k) + gupxz(i,j,k) * chiz(i,j,k)) / chin_loc
Gamyxx = Gamyxx - ( - gxx * gxxy )*HALF gxxy(i,j,k) = (gupxy(i,j,k) * chix(i,j,k) + gupyy(i,j,k) * chiy(i,j,k) + gupyz(i,j,k) * chiz(i,j,k)) / chin_loc
Gamzxx = Gamzxx - ( - gxx * gxxz )*HALF gxxz(i,j,k) = (gupxz(i,j,k) * chix(i,j,k) + gupyz(i,j,k) * chiy(i,j,k) + gupzz(i,j,k) * chiz(i,j,k)) / chin_loc
Gamxyy = Gamxyy - ( - gyy * gxxx )*HALF
Gamyyy = Gamyyy - ( (chiy + chiy)/chin1 - gyy * gxxy )*HALF
Gamzyy = Gamzyy - ( - gyy * gxxz )*HALF
Gamxzz = Gamxzz - ( - gzz * gxxx )*HALF
Gamyzz = Gamyzz - ( - gzz * gxxy )*HALF
Gamzzz = Gamzzz - ( (chiz + chiz)/chin1 - gzz * gxxz )*HALF
Gamxxy = Gamxxy - ( chiy /chin1 - gxy * gxxx )*HALF
Gamyxy = Gamyxy - ( chix /chin1 - gxy * gxxy )*HALF
Gamzxy = Gamzxy - ( - gxy * gxxz )*HALF
Gamxxz = Gamxxz - ( chiz /chin1 - gxz * gxxx )*HALF
Gamyxz = Gamyxz - ( - gxz * gxxy )*HALF
Gamzxz = Gamzxz - ( chix /chin1 - gxz * gxxz )*HALF
Gamxyz = Gamxyz - ( - gyz * gxxx )*HALF
Gamyyz = Gamyyz - ( chiz /chin1 - gyz * gxxy )*HALF
Gamzyz = Gamzyz - ( chiy /chin1 - gyz * gxxz )*HALF
fxx = fxx - Gamxxx*Lapx - Gamyxx*Lapy - Gamzxx*Lapz Gamxxx(i,j,k) = Gamxxx(i,j,k) - ( (chix(i,j,k) + chix(i,j,k))/chin_loc - gxx(i,j,k) * gxxx(i,j,k) )*HALF
fyy = fyy - Gamxyy*Lapx - Gamyyy*Lapy - Gamzyy*Lapz Gamyxx(i,j,k) = Gamyxx(i,j,k) - ( - gxx(i,j,k) * gxxy(i,j,k) )*HALF
fzz = fzz - Gamxzz*Lapx - Gamyzz*Lapy - Gamzzz*Lapz Gamzxx(i,j,k) = Gamzxx(i,j,k) - ( - gxx(i,j,k) * gxxz(i,j,k) )*HALF
fxy = fxy - Gamxxy*Lapx - Gamyxy*Lapy - Gamzxy*Lapz Gamxyy(i,j,k) = Gamxyy(i,j,k) - ( - gyy(i,j,k) * gxxx(i,j,k) )*HALF
fxz = fxz - Gamxxz*Lapx - Gamyxz*Lapy - Gamzxz*Lapz Gamyyy(i,j,k) = Gamyyy(i,j,k) - ( (chiy(i,j,k) + chiy(i,j,k))/chin_loc - gyy(i,j,k) * gxxy(i,j,k) )*HALF
fyz = fyz - Gamxyz*Lapx - Gamyyz*Lapy - Gamzyz*Lapz Gamzyy(i,j,k) = Gamzyy(i,j,k) - ( - gyy(i,j,k) * gxxz(i,j,k) )*HALF
Gamxzz(i,j,k) = Gamxzz(i,j,k) - ( - gzz(i,j,k) * gxxx(i,j,k) )*HALF
Gamyzz(i,j,k) = Gamyzz(i,j,k) - ( - gzz(i,j,k) * gxxy(i,j,k) )*HALF
Gamzzz(i,j,k) = Gamzzz(i,j,k) - ( (chiz(i,j,k) + chiz(i,j,k))/chin_loc - gzz(i,j,k) * gxxz(i,j,k) )*HALF
Gamxxy(i,j,k) = Gamxxy(i,j,k) - ( chiy(i,j,k) /chin_loc - gxy(i,j,k) * gxxx(i,j,k) )*HALF
Gamyxy(i,j,k) = Gamyxy(i,j,k) - ( chix(i,j,k) /chin_loc - gxy(i,j,k) * gxxy(i,j,k) )*HALF
Gamzxy(i,j,k) = Gamzxy(i,j,k) - ( - gxy(i,j,k) * gxxz(i,j,k) )*HALF
Gamxxz(i,j,k) = Gamxxz(i,j,k) - ( chiz(i,j,k) /chin_loc - gxz(i,j,k) * gxxx(i,j,k) )*HALF
Gamyxz(i,j,k) = Gamyxz(i,j,k) - ( - gxz(i,j,k) * gxxy(i,j,k) )*HALF
Gamzxz(i,j,k) = Gamzxz(i,j,k) - ( chix(i,j,k) /chin_loc - gxz(i,j,k) * gxxz(i,j,k) )*HALF
Gamxyz(i,j,k) = Gamxyz(i,j,k) - ( - gyz(i,j,k) * gxxx(i,j,k) )*HALF
Gamyyz(i,j,k) = Gamyyz(i,j,k) - ( chiz(i,j,k) /chin_loc - gyz(i,j,k) * gxxy(i,j,k) )*HALF
Gamzyz(i,j,k) = Gamzyz(i,j,k) - ( chiy(i,j,k) /chin_loc - gyz(i,j,k) * gxxz(i,j,k) )*HALF
! store D^i D_i Lap in trK_rhs upto chi fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k)*Lapx(i,j,k) - Gamyxx(i,j,k)*Lapy(i,j,k) - Gamzxx(i,j,k)*Lapz(i,j,k)
trK_rhs = gupxx * fxx + gupyy * fyy + gupzz * fzz + & fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k)*Lapx(i,j,k) - Gamyyy(i,j,k)*Lapy(i,j,k) - Gamzyy(i,j,k)*Lapz(i,j,k)
TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k)*Lapx(i,j,k) - Gamyzz(i,j,k)*Lapy(i,j,k) - Gamzzz(i,j,k)*Lapz(i,j,k)
#if 1 fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k)*Lapx(i,j,k) - Gamyxy(i,j,k)*Lapy(i,j,k) - Gamzxy(i,j,k)*Lapz(i,j,k)
!! follow bam code fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k)*Lapx(i,j,k) - Gamyxz(i,j,k)*Lapy(i,j,k) - Gamzxz(i,j,k)*Lapz(i,j,k)
S = chin1 * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + & fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k)*Lapx(i,j,k) - Gamyyz(i,j,k)*Lapy(i,j,k) - Gamzyz(i,j,k)*Lapz(i,j,k)
TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
f = F2o3 * trK * trK -(&
gupxx * ( &
gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz) ) + &
gupyy * ( &
gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz) ) + &
gupzz * ( &
gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz) ) + &
TWO * ( &
gupxy * ( &
gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
gupxy * (Axx * Ayy + Axy * Axy) + &
gupxz * (Axx * Ayz + Axz * Axy) + &
gupyz * (Axy * Ayz + Axz * Ayy) ) + &
gupxz * ( &
gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
gupxy * (Axx * Ayz + Axy * Axz) + &
gupxz * (Axx * Azz + Axz * Axz) + &
gupyz * (Axy * Azz + Axz * Ayz) ) + &
gupyz * ( &
gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
gupxy * (Axy * Ayz + Ayy * Axz) + &
gupxz * (Axy * Azz + Ayz * Axz) + &
gupyz * (Ayy * Azz + Ayz * Ayz) ) )) -1.6d1*PI*rho + EIGHT * PI * S
f = - F1o3 *( gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1/chin1*f)
fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx trK_rhs(i,j,k) = gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k))
fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz enddo
fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy enddo
fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz enddo
fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz do k=1,ex(3)
#else do j=1,ex(2)
! Add lapse and S_ij parts to Ricci tensor: do i=1,ex(1)
divb_loc = div_beta(i,j,k)
chin_loc = chin1(i,j,k)
fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx S_loc = chin_loc * ( gupxx(i,j,k) * Sxx(i,j,k) + gupyy(i,j,k) * Syy(i,j,k) + gupzz(i,j,k) * Szz(i,j,k) + &
fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy TWO * (gupxy(i,j,k) * Sxy(i,j,k) + gupxz(i,j,k) * Sxz(i,j,k) + gupyz(i,j,k) * Syz(i,j,k)) )
fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz S(i,j,k) = S_loc
fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
! Compute trace-free part (note: chi^-1 and chi cancel!): f_loc = F2o3 * trK(i,j,k) * trK(i,j,k) - ( &
gupxx(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + &
TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + &
gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k)) ) + &
gupyy(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k)) ) + &
gupzz(i,j,k) * ( gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + &
TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k)) ) + &
TWO * ( gupxy(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k)) ) + &
gupxz(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k)) ) + &
gupyz(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + &
gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k)) ) ) ) - &
F16 * PI * rho(i,j,k) + EIGHT * PI * S_loc
f = F1o3 *( gupxx * fxx + gupyy * fyy + gupzz * fzz + & f_loc = -F1o3 * ( gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) ) TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + &
#endif alpn1(i,j,k)/chin_loc * f_loc )
f(i,j,k) = f_loc
Axx_rhs = fxx - gxx * f l_fxx = alpn1(i,j,k) * (Rxx(i,j,k) - EIGHT * PI * Sxx(i,j,k)) - fxx(i,j,k)
Ayy_rhs = fyy - gyy * f l_fxy = alpn1(i,j,k) * (Rxy(i,j,k) - EIGHT * PI * Sxy(i,j,k)) - fxy(i,j,k)
Azz_rhs = fzz - gzz * f l_fxz = alpn1(i,j,k) * (Rxz(i,j,k) - EIGHT * PI * Sxz(i,j,k)) - fxz(i,j,k)
Axy_rhs = fxy - gxy * f l_fyy = alpn1(i,j,k) * (Ryy(i,j,k) - EIGHT * PI * Syy(i,j,k)) - fyy(i,j,k)
Axz_rhs = fxz - gxz * f l_fyz = alpn1(i,j,k) * (Ryz(i,j,k) - EIGHT * PI * Syz(i,j,k)) - fyz(i,j,k)
Ayz_rhs = fyz - gyz * f l_fzz = alpn1(i,j,k) * (Rzz(i,j,k) - EIGHT * PI * Szz(i,j,k)) - fzz(i,j,k)
! Now: store A_il A^l_j into fij: Axx_rhs(i,j,k) = l_fxx - gxx(i,j,k) * f_loc
Ayy_rhs(i,j,k) = l_fyy - gyy(i,j,k) * f_loc
Azz_rhs(i,j,k) = l_fzz - gzz(i,j,k) * f_loc
Axy_rhs(i,j,k) = l_fxy - gxy(i,j,k) * f_loc
Axz_rhs(i,j,k) = l_fxz - gxz(i,j,k) * f_loc
Ayz_rhs(i,j,k) = l_fyz - gyz(i,j,k) * f_loc
fxx = gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + & fxx(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz) gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + &
fyy = gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + & gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k))
TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz) fyy(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
fzz = gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + & gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz) gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k))
fxy = gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + & fzz(i,j,k) = gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
gupxy *(Axx * Ayy + Axy * Axy) + & gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
gupxz *(Axx * Ayz + Axz * Axy) + & gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k))
gupyz *(Axy * Ayz + Axz * Ayy) fxy(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
fxz = gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + & gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
gupxy *(Axx * Ayz + Axy * Axz) + & gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
gupxz *(Axx * Azz + Axz * Axz) + & gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k))
gupyz *(Axy * Azz + Axz * Ayz) fxz(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
fyz = gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + & gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
gupxy *(Axy * Ayz + Ayy * Axz) + & gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
gupxz *(Axy * Azz + Ayz * Axz) + & gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k))
gupyz *(Ayy * Azz + Ayz * Ayz) fyz(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k))
f = chin1 trK_rhs(i,j,k) = chin_loc * trK_rhs(i,j,k)
! store D^i D_i Lap in trK_rhs
trK_rhs = f*trK_rhs
Axx_rhs = f * Axx_rhs+ alpn1 * (trK * Axx - TWO * fxx) + & Axx_rhs(i,j,k) = chin_loc * Axx_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axx(i,j,k) - TWO * fxx(i,j,k)) + &
TWO * ( Axx * betaxx + Axy * betayx + Axz * betazx )- & TWO * (Axx(i,j,k) * betaxx(i,j,k) + Axy(i,j,k) * betayx(i,j,k) + Axz(i,j,k) * betazx(i,j,k)) - &
F2o3 * Axx * div_beta F2o3 * Axx(i,j,k) * divb_loc
Ayy_rhs(i,j,k) = chin_loc * Ayy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayy(i,j,k) - TWO * fyy(i,j,k)) + &
TWO * (Axy(i,j,k) * betaxy(i,j,k) + Ayy(i,j,k) * betayy(i,j,k) + Ayz(i,j,k) * betazy(i,j,k)) - &
F2o3 * Ayy(i,j,k) * divb_loc
Azz_rhs(i,j,k) = chin_loc * Azz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Azz(i,j,k) - TWO * fzz(i,j,k)) + &
TWO * (Axz(i,j,k) * betaxz(i,j,k) + Ayz(i,j,k) * betayz(i,j,k) + Azz(i,j,k) * betazz(i,j,k)) - &
F2o3 * Azz(i,j,k) * divb_loc
Axy_rhs(i,j,k) = chin_loc * Axy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axy(i,j,k) - TWO * fxy(i,j,k)) + &
Axx(i,j,k) * betaxy(i,j,k) + Axz(i,j,k) * betazy(i,j,k) + Ayy(i,j,k) * betayx(i,j,k) + &
Ayz(i,j,k) * betazx(i,j,k) + F1o3 * Axy(i,j,k) * divb_loc - Axy(i,j,k) * betazz(i,j,k)
Ayz_rhs(i,j,k) = chin_loc * Ayz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayz(i,j,k) - TWO * fyz(i,j,k)) + &
Axy(i,j,k) * betaxz(i,j,k) + Ayy(i,j,k) * betayz(i,j,k) + Axz(i,j,k) * betaxy(i,j,k) + &
Azz(i,j,k) * betazy(i,j,k) + F1o3 * Ayz(i,j,k) * divb_loc - Ayz(i,j,k) * betaxx(i,j,k)
Axz_rhs(i,j,k) = chin_loc * Axz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axz(i,j,k) - TWO * fxz(i,j,k)) + &
Axx(i,j,k) * betaxz(i,j,k) + Axy(i,j,k) * betayz(i,j,k) + Ayz(i,j,k) * betayx(i,j,k) + &
Azz(i,j,k) * betazx(i,j,k) + F1o3 * Axz(i,j,k) * divb_loc - Axz(i,j,k) * betayy(i,j,k)
Ayy_rhs = f * Ayy_rhs+ alpn1 * (trK * Ayy - TWO * fyy) + & trK_rhs(i,j,k) = - trK_rhs(i,j,k) + alpn1(i,j,k) * ( F1o3 * trK(i,j,k) * trK(i,j,k) + &
TWO * ( Axy * betaxy + Ayy * betayy + Ayz * betazy )- & gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
F2o3 * Ayy * div_beta TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + &
FOUR * PI * (rho(i,j,k) + S_loc) )
Azz_rhs = f * Azz_rhs+ alpn1 * (trK * Azz - TWO * fzz) + & enddo
TWO * ( Axz * betaxz + Ayz * betayz + Azz * betazz )- & enddo
F2o3 * Azz * div_beta enddo
Axy_rhs = f * Axy_rhs+ alpn1 *( trK * Axy - TWO * fxy )+ &
Axx * betaxy + Axz * betazy + &
Ayy * betayx + Ayz * betazx + &
F1o3 * Axy * div_beta - Axy * betazz
Ayz_rhs = f * Ayz_rhs+ alpn1 *( trK * Ayz - TWO * fyz )+ &
Axy * betaxz + Ayy * betayz + &
Axz * betaxy + Azz * betazy + &
F1o3 * Ayz * div_beta - Ayz * betaxx
Axz_rhs = f * Axz_rhs+ alpn1 *( trK * Axz - TWO * fxz )+ &
Axx * betaxz + Axy * betayz + &
Ayz * betayx + Azz * betazx + &
F1o3 * Axz * div_beta - Axz * betayy !rhs for Aij
! Compute trace of S_ij
S = f * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
trK_rhs = - trK_rhs + alpn1 *( F1o3 * trK * trK + &
gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO * ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + &
FOUR * PI * ( rho + S )) !rhs for trK
!!!! gauge variable part !!!! gauge variable part
@@ -948,15 +1000,15 @@
!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency) !!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
! lopsided_kodis shares the symmetry_bd buffer between advection and ! lopsided_kodis shares the symmetry_bd buffer between advection and
! dissipation, eliminating redundant full-grid copies. For metric variables ! dissipation, eliminating redundant full-grid copies. For metric variables
! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero, ! gxx/gyy/gzz (=dxx/dyy/dzz+1): stencil coefficients sum to zero,
! so the constant offset has no effect on dissipation. ! so the constant offset has no effect on dissipation.
call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps) call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps) call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)

View File

@@ -32,6 +32,19 @@
#define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_ #define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_
#define f_compute_constraint_fr compute_constraint_fr_ #define f_compute_constraint_fr compute_constraint_fr_
#endif #endif
#ifdef __cplusplus
extern "C"
{
#endif
  /*
   * Accessors for the optional fine-grained timing of the BSSN RHS kernel.
   * The parameter lists are spelled (void) so that a C translation unit sees
   * real prototypes; in C++ this is identical to an empty list.
   */

  /* Set every timing accumulator back to zero. */
  void f_bssn_rhs_kernel_timing_reset(void);

  /* Number of timing buckets; 0 when fine timing is compiled out. */
  int f_bssn_rhs_kernel_timing_bucket_count(void);

  /* Accumulated seconds, one entry per bucket; a null pointer when fine
     timing is compiled out, so check the bucket count before indexing. */
  const double *f_bssn_rhs_kernel_timing_local_seconds(void);

  /* Label of the bucket with the given index: "unknown" for an index outside
     the valid range, "disabled" when fine timing is compiled out. */
  const char *f_bssn_rhs_kernel_timing_label(int);
#ifdef __cplusplus
}
#endif
extern "C" extern "C"
{ {
int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z

View File

@@ -2,12 +2,88 @@
#include "bssn_rhs.h" #include "bssn_rhs.h"
#include "share_func.h" #include "share_func.h"
#include "tool.h" #include "tool.h"
#include <time.h>
// 0-based i,j,k // 0-based i,j,k
// #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny)) // #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny))
// ex(1)=nx, ex(2)=ny, ex(3)=nz // ex(1)=nx, ex(2)=ny, ex(3)=nz
// 用法a[ IDX_F(i,j,k,nx,ny) ] // 用法a[ IDX_F(i,j,k,nx,ny) ]
#ifndef BSSN_KERNEL_FINE_TIMING
#define BSSN_KERNEL_FINE_TIMING 0
#endif
#if BSSN_KERNEL_FINE_TIMING
namespace rhs_kernel_timing
{
  // One enumerator per timed section of the RHS kernel. KB_COUNT is not a
  // section; it is the number of buckets and sizes the arrays below.
  enum Bucket
  {
    KB_SETUP_DERIVS = 0,
    KB_GEOM_GAMMA,
    KB_RICCI_METRIC,
    KB_CHI_LAPSE,
    KB_AIJ_TRK_GAUGE,
    KB_KO_CONSTRAINT,
    KB_COUNT
  };

  // Seconds accumulated per bucket; static storage, so it starts at zero.
  static double local_bucket_seconds[KB_COUNT];

  // Printable bucket names, in the same order as the enumerators above.
  static const char *bucket_labels[KB_COUNT] =
      {"setup_derivs", "geom_gamma", "ricci_metric",
       "chi_lapse", "aij_trk_gauge", "ko_constraint"};

  // Current CLOCK_MONOTONIC reading converted to seconds.
  static inline double now_seconds()
  {
    struct timespec stamp;
    clock_gettime(CLOCK_MONOTONIC, &stamp);
    const double whole_seconds = static_cast<double>(stamp.tv_sec);
    const double fractional_seconds = static_cast<double>(stamp.tv_nsec) * 1.0e-9;
    return whole_seconds + fractional_seconds;
  }
}
// Zero every timing accumulator so the next measurement interval starts clean.
extern "C" void f_bssn_rhs_kernel_timing_reset()
{
  double *slot = rhs_kernel_timing::local_bucket_seconds;
  double *const past_last = slot + rhs_kernel_timing::KB_COUNT;
  while (slot != past_last)
    *slot++ = 0.0;
}
// Report how many timing buckets exist (the value of KB_COUNT).
extern "C" int f_bssn_rhs_kernel_timing_bucket_count()
{
  const int bucket_total = static_cast<int>(rhs_kernel_timing::KB_COUNT);
  return bucket_total;
}
// Expose the accumulator array read-only; it holds one entry per bucket,
// indexed by the Bucket enumerator values.
extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds()
{
  return &rhs_kernel_timing::local_bucket_seconds[0];
}
// Map a bucket index to its label; an index outside [0, KB_COUNT) yields
// "unknown" instead of reading past the label table.
extern "C" const char *f_bssn_rhs_kernel_timing_label(int bucket_index)
{
  const bool in_range =
      bucket_index >= 0 && bucket_index < rhs_kernel_timing::KB_COUNT;
  return in_range ? rhs_kernel_timing::bucket_labels[bucket_index] : "unknown";
}
// Start of a timed section: declares a const double named var_name holding the
// current rhs_kernel_timing::now_seconds() reading. No trailing semicolon; the
// call site supplies it.
#define RHS_KERNEL_TIMER_DECL(var_name) const double var_name = rhs_kernel_timing::now_seconds()
// End of a timed section: adds the time elapsed since var_name was captured to
// the accumulator of bucket_name. bucket_name is an unqualified Bucket
// enumerator (e.g. KB_GEOM_GAMMA); the macro adds the namespace qualifier.
#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name) \
rhs_kernel_timing::local_bucket_seconds[int(rhs_kernel_timing::bucket_name)] += \
rhs_kernel_timing::now_seconds() - (var_name)
#else
// Fine timing compiled out: the accessors still exist so callers link
// unchanged, but they report that nothing is being measured.
// Nothing to reset.
extern "C" void f_bssn_rhs_kernel_timing_reset() {}
// Zero buckets: callers looping up to this count never index the array below.
extern "C" int f_bssn_rhs_kernel_timing_bucket_count() { return 0; }
// No accumulator array exists in this configuration; returns a null pointer.
extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds() { return 0; }
// Every index maps to the same placeholder label.
extern "C" const char *f_bssn_rhs_kernel_timing_label(int) { return "disabled"; }
// The timer macros expand to nothing, so each use leaves only the call site's
// own semicolon (an empty statement) and declares no variable.
#define RHS_KERNEL_TIMER_DECL(var_name)
#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name)
#endif
// C function that calculates the right-hand side for BSSN equations // C function that calculates the right-hand side for BSSN equations
int f_compute_rhs_bssn(int *ex, double &T, int f_compute_rhs_bssn(int *ex, double &T,
double *X, double *Y, double *Z, double *X, double *Y, double *Z,
@@ -102,6 +178,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
dY = Y[1] - Y[0]; dY = Y[1] - Y[0];
dZ = Z[1] - Z[0]; dZ = Z[1] - Z[0];
RHS_KERNEL_TIMER_DECL(timer_setup_derivs);
// 1ms // // 1ms //
for(int i=0;i<all;i+=1){ for(int i=0;i<all;i+=1){
alpn1[i] = Lap[i] + 1.0; alpn1[i] = Lap[i] + 1.0;
@@ -141,6 +218,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
(dxx[i] + ONE) * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i] (dxx[i] + ONE) * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i]
+ (dzz[i] + ONE) * betazx[i] - gxz[i] * betayy[i]; + (dzz[i] + ONE) * betazx[i] - gxz[i] * betayy[i];
} }
RHS_KERNEL_TIMER_ADD(KB_SETUP_DERIVS, timer_setup_derivs);
RHS_KERNEL_TIMER_DECL(timer_geom_gamma);
// Fused: inverse metric + Gamma constraint + Christoffel (3 loops -> 1) // Fused: inverse metric + Gamma constraint + Christoffel (3 loops -> 1)
for(int i=0;i<all;i+=1){ for(int i=0;i<all;i+=1){
double det = (dxx[i] + ONE) * (dyy[i] + ONE) * (dzz[i] + ONE) + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] - double det = (dxx[i] + ONE) * (dyy[i] + ONE) * (dzz[i] + ONE) + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] -
@@ -283,9 +362,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ ( gupxy[i]*gupyz[i] + gupyy[i]*gupxz[i] ) * Axy[i] + ( gupxy[i]*gupyz[i] + gupyy[i]*gupxz[i] ) * Axy[i]
+ ( gupxy[i]*gupzz[i] + gupyz[i]*gupxz[i] ) * Axz[i] + ( gupxy[i]*gupzz[i] + gupyz[i]*gupxz[i] ) * Axz[i]
+ ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i]; + ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i];
Rxx[i] = axx; Ryy[i] = ayy; Rzz[i] = azz;
Rxy[i] = axy; Rxz[i] = axz; Ryz[i] = ayz;
Gamx_rhs[i] = - TWO * ( Lapx[i]*axx + Lapy[i]*axy + Lapz[i]*axz ) + Gamx_rhs[i] = - TWO * ( Lapx[i]*axx + Lapy[i]*axy + Lapz[i]*axz ) +
TWO * alpn1[i] * ( TWO * alpn1[i] * (
-F3o2/chin1[i] * ( chix[i]*axx + chiy[i]*axy + chiz[i]*axz ) - -F3o2/chin1[i] * ( chix[i]*axx + chiy[i]*axy + chiz[i]*axz ) -
@@ -315,6 +391,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ TWO * ( Gamzxy[i]*axy + Gamzxz[i]*axz + Gamzyz[i]*ayz ) + TWO * ( Gamzxy[i]*axy + Gamzxz[i]*axz + Gamzyz[i]*ayz )
); );
} }
RHS_KERNEL_TIMER_ADD(KB_GEOM_GAMMA, timer_geom_gamma);
RHS_KERNEL_TIMER_DECL(timer_ricci_metric);
// 22.3ms // // 22.3ms //
fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx, fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev); X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev);
@@ -332,7 +410,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
double lfxx = gxxx[i] + gxyy[i] + gxzz[i]; double lfxx = gxxx[i] + gxyy[i] + gxzz[i];
double lfxy = gxyx[i] + gyyy[i] + gyzz[i]; double lfxy = gxyx[i] + gyyy[i] + gyzz[i];
double lfxz = gxzx[i] + gyzy[i] + gzzz[i]; double lfxz = gxzx[i] + gyzy[i] + gzzz[i];
fxx[i] = lfxx; fxy[i] = lfxy; fxz[i] = lfxz;
double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i] double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
+ TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] ); + TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] );
@@ -686,69 +763,74 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ Gamxyz[i] * gzzx[i] + Gamyyz[i] * gzzy[i] + Gamzyz[i] * gzzz[i] + Gamxyz[i] * gzzx[i] + Gamyyz[i] * gzzy[i] + Gamzyz[i] * gzzz[i]
); );
} }
RHS_KERNEL_TIMER_ADD(KB_RICCI_METRIC, timer_ricci_metric);
RHS_KERNEL_TIMER_DECL(timer_chi_lapse);
// 22.3ms // // 22.3ms //
fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 7ms // // 7ms //
for (int i=0;i<all;i+=1) { for (int i=0;i<all;i+=1) {
fxx[i] = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i]; const double inv_chin1 = ONE / chin1[i];
fxy[i] = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i]; const double half_inv_chin1 = HALF * inv_chin1;
fxz[i] = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i]; const double scaled_inv = F3o2 * inv_chin1;
fyy[i] = fyy[i] - Gamxyy[i] * chix[i] - Gamyyy[i] * chiy[i] - Gamzyy[i] * chiz[i]; const double cxx = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i];
fyz[i] = fyz[i] - Gamxyz[i] * chix[i] - Gamyyz[i] * chiy[i] - Gamzyz[i] * chiz[i]; const double cxy = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i];
fzz[i] = fzz[i] - Gamxzz[i] * chix[i] - Gamyzz[i] * chiy[i] - Gamzzz[i] * chiz[i]; const double cxz = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i];
f[i] = const double cyy = fyy[i] - Gamxyy[i] * chix[i] - Gamyyy[i] * chiy[i] - Gamzyy[i] * chiz[i];
gupxx[i] * (fxx[i] - (F3o2 / chin1[i]) * chix[i] * chix[i]) const double cyz = fyz[i] - Gamxyz[i] * chix[i] - Gamyyz[i] * chiy[i] - Gamzyz[i] * chiz[i];
+ gupyy[i] * (fyy[i] - (F3o2 / chin1[i]) * chiy[i] * chiy[i]) const double czz = fzz[i] - Gamxzz[i] * chix[i] - Gamyzz[i] * chiy[i] - Gamzzz[i] * chiz[i];
+ gupzz[i] * (fzz[i] - (F3o2 / chin1[i]) * chiz[i] * chiz[i]) const double ricci_chi =
+ TWO * gupxy[i] * (fxy[i] - (F3o2 / chin1[i]) * chix[i] * chiy[i]) gupxx[i] * (cxx - scaled_inv * chix[i] * chix[i])
+ TWO * gupxz[i] * (fxz[i] - (F3o2 / chin1[i]) * chix[i] * chiz[i]) + gupyy[i] * (cyy - scaled_inv * chiy[i] * chiy[i])
+ TWO * gupyz[i] * (fyz[i] - (F3o2 / chin1[i]) * chiy[i] * chiz[i]); + gupzz[i] * (czz - scaled_inv * chiz[i] * chiz[i])
Rxx[i] = Rxx[i] + ( fxx[i] - (chix[i] * chix[i]) / (chin1[i] * TWO) + (dxx[i] + ONE) * f[i] ) / (chin1[i] * TWO); + TWO * gupxy[i] * (cxy - scaled_inv * chix[i] * chiy[i])
Ryy[i] = Ryy[i] + ( fyy[i] - (chiy[i] * chiy[i]) / (chin1[i] * TWO) + (dyy[i] + ONE) * f[i] ) / (chin1[i] * TWO); + TWO * gupxz[i] * (cxz - scaled_inv * chix[i] * chiz[i])
Rzz[i] = Rzz[i] + ( fzz[i] - (chiz[i] * chiz[i]) / (chin1[i] * TWO) + (dzz[i] + ONE) * f[i] ) / (chin1[i] * TWO); + TWO * gupyz[i] * (cyz - scaled_inv * chiy[i] * chiz[i]);
f[i] = ricci_chi;
Rxx[i] = Rxx[i] + ( cxx - half_inv_chin1 * chix[i] * chix[i] + (dxx[i] + ONE) * ricci_chi ) * half_inv_chin1;
Ryy[i] = Ryy[i] + ( cyy - half_inv_chin1 * chiy[i] * chiy[i] + (dyy[i] + ONE) * ricci_chi ) * half_inv_chin1;
Rzz[i] = Rzz[i] + ( czz - half_inv_chin1 * chiz[i] * chiz[i] + (dzz[i] + ONE) * ricci_chi ) * half_inv_chin1;
Rxy[i] = Rxy[i] + ( fxy[i] - (chix[i] * chiy[i]) / (chin1[i] * TWO) + gxy[i] * f[i] ) / (chin1[i] * TWO); Rxy[i] = Rxy[i] + ( cxy - half_inv_chin1 * chix[i] * chiy[i] + gxy[i] * ricci_chi ) * half_inv_chin1;
Rxz[i] = Rxz[i] + ( fxz[i] - (chix[i] * chiz[i]) / (chin1[i] * TWO) + gxz[i] * f[i] ) / (chin1[i] * TWO); Rxz[i] = Rxz[i] + ( cxz - half_inv_chin1 * chix[i] * chiz[i] + gxz[i] * ricci_chi ) * half_inv_chin1;
Ryz[i] = Ryz[i] + ( fyz[i] - (chiy[i] * chiz[i]) / (chin1[i] * TWO) + gyz[i] * f[i] ) / (chin1[i] * TWO); Ryz[i] = Ryz[i] + ( cyz - half_inv_chin1 * chiy[i] * chiz[i] + gyz[i] * ricci_chi ) * half_inv_chin1;
} }
// 24ms // // 24ms //
fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 6ms // // 6ms //
for (int i=0;i<all;i+=1) { for (int i=0;i<all;i+=1) {
/* gxxx,gxxy,gxxz (这里是“升指标后的chi导数/chi”那类量你沿用原变量名即可) */ const double inv_chin1 = ONE / chin1[i];
gxxx[i] = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) / chin1[i]; const double gchi_x = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) * inv_chin1;
gxxy[i] = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) / chin1[i]; const double gchi_y = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) * inv_chin1;
gxxz[i] = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) / chin1[i]; const double gchi_z = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) * inv_chin1;
/* Christoffel 修正项 */ /* Christoffel 修正项 */
Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) / chin1[i]) - (dxx[i] + ONE) * gxxx[i] ) * HALF; Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) * inv_chin1) - (dxx[i] + ONE) * gchi_x ) * HALF;
Gamyxx[i] = Gamyxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxy[i] ) * HALF; /* 原式只有 -gxx*gxxy */ Gamyxx[i] = Gamyxx[i] - ( 0.0 - (dxx[i] + ONE) * gchi_y ) * HALF; /* 原式只有 -gxx*gxxy */
Gamzxx[i] = Gamzxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxz[i] ) * HALF; Gamzxx[i] = Gamzxx[i] - ( 0.0 - (dxx[i] + ONE) * gchi_z ) * HALF;
Gamxyy[i] = Gamxyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxx[i] ) * HALF; Gamxyy[i] = Gamxyy[i] - ( 0.0 - (dyy[i] + ONE) * gchi_x ) * HALF;
Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) / chin1[i]) - (dyy[i] + ONE) * gxxy[i] ) * HALF; Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) * inv_chin1) - (dyy[i] + ONE) * gchi_y ) * HALF;
Gamzyy[i] = Gamzyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxz[i] ) * HALF; Gamzyy[i] = Gamzyy[i] - ( 0.0 - (dyy[i] + ONE) * gchi_z ) * HALF;
Gamxzz[i] = Gamxzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxx[i] ) * HALF; Gamxzz[i] = Gamxzz[i] - ( 0.0 - (dzz[i] + ONE) * gchi_x ) * HALF;
Gamyzz[i] = Gamyzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxy[i] ) * HALF; Gamyzz[i] = Gamyzz[i] - ( 0.0 - (dzz[i] + ONE) * gchi_y ) * HALF;
Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) / chin1[i]) - (dzz[i] + ONE) * gxxz[i] ) * HALF; Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) * inv_chin1) - (dzz[i] + ONE) * gchi_z ) * HALF;
Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] / chin1[i]) - gxy[i] * gxxx[i] ) * HALF; Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] * inv_chin1) - gxy[i] * gchi_x ) * HALF;
Gamyxy[i] = Gamyxy[i] - ( ( chix[i] / chin1[i]) - gxy[i] * gxxy[i] ) * HALF; Gamyxy[i] = Gamyxy[i] - ( ( chix[i] * inv_chin1) - gxy[i] * gchi_y ) * HALF;
Gamzxy[i] = Gamzxy[i] - ( 0.0 - gxy[i] * gxxz[i] ) * HALF; Gamzxy[i] = Gamzxy[i] - ( 0.0 - gxy[i] * gchi_z ) * HALF;
Gamxxz[i] = Gamxxz[i] - ( ( chiz[i] / chin1[i]) - gxz[i] * gxxx[i] ) * HALF; Gamxxz[i] = Gamxxz[i] - ( ( chiz[i] * inv_chin1) - gxz[i] * gchi_x ) * HALF;
Gamyxz[i] = Gamyxz[i] - ( 0.0 - gxz[i] * gxxy[i] ) * HALF; Gamyxz[i] = Gamyxz[i] - ( 0.0 - gxz[i] * gchi_y ) * HALF;
Gamzxz[i] = Gamzxz[i] - ( ( chix[i] / chin1[i]) - gxz[i] * gxxz[i] ) * HALF; Gamzxz[i] = Gamzxz[i] - ( ( chix[i] * inv_chin1) - gxz[i] * gchi_z ) * HALF;
Gamxyz[i] = Gamxyz[i] - ( 0.0 - gyz[i] * gxxx[i] ) * HALF; Gamxyz[i] = Gamxyz[i] - ( 0.0 - gyz[i] * gchi_x ) * HALF;
Gamyyz[i] = Gamyyz[i] - ( ( chiz[i] / chin1[i]) - gyz[i] * gxxy[i] ) * HALF; Gamyyz[i] = Gamyyz[i] - ( ( chiz[i] * inv_chin1) - gyz[i] * gchi_y ) * HALF;
Gamzyz[i] = Gamzyz[i] - ( ( chiy[i] / chin1[i]) - gyz[i] * gxxz[i] ) * HALF; Gamzyz[i] = Gamzyz[i] - ( ( chiy[i] * inv_chin1) - gyz[i] * gchi_z ) * HALF;
/* fxx..fyz 修正:减去 Γ * ∂Lap */ /* fxx..fyz 修正:减去 Γ * ∂Lap */
fxx[i] = fxx[i] - Gamxxx[i] * Lapx[i] - Gamyxx[i] * Lapy[i] - Gamzxx[i] * Lapz[i]; fxx[i] = fxx[i] - Gamxxx[i] * Lapx[i] - Gamyxx[i] * Lapy[i] - Gamzxx[i] * Lapz[i];
@@ -762,6 +844,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i] trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] ); + TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] );
} }
RHS_KERNEL_TIMER_ADD(KB_CHI_LAPSE, timer_chi_lapse);
RHS_KERNEL_TIMER_DECL(timer_aij_trk_gauge);
// 2.5ms // // 2.5ms //
for (int i=0;i<all;i+=1) { for (int i=0;i<all;i+=1) {
const double divb = betaxx[i] + betayy[i] + betazz[i]; const double divb = betaxx[i] + betayy[i] + betazz[i];
@@ -1062,6 +1146,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i]; dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
#endif #endif
} }
RHS_KERNEL_TIMER_ADD(KB_AIJ_TRK_GAUGE, timer_aij_trk_gauge);
RHS_KERNEL_TIMER_DECL(timer_ko_constraint);
// advection + KO dissipation with shared symmetry buffer // advection + KO dissipation with shared symmetry buffer
lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps); lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps); lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
@@ -1139,60 +1225,61 @@ int f_compute_rhs_bssn(int *ex, double &T,
fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0); fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0); fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0);
fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0); fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
} // 7ms //
// 7ms // for (int i=0;i<all;i+=1) {
for (int i=0;i<all;i+=1) { gxxx[i] = gxxx[i] - ( Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]
gxxx[i] = gxxx[i] - ( Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i] + Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i];
+ Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i]; gxyx[i] = gxyx[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
gxyx[i] = gxyx[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i] + Gamxxx[i] * Axy[i] + Gamyxx[i] * Ayy[i] + Gamzxx[i] * Ayz[i]) - chix[i]*Axy[i]/chin1[i];
+ Gamxxx[i] * Axy[i] + Gamyxx[i] * Ayy[i] + Gamzxx[i] * Ayz[i]) - chix[i]*Axy[i]/chin1[i]; gxzx[i] = gxzx[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
gxzx[i] = gxzx[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i] + Gamxxx[i] * Axz[i] + Gamyxx[i] * Ayz[i] + Gamzxx[i] * Azz[i]) - chix[i]*Axz[i]/chin1[i];
+ Gamxxx[i] * Axz[i] + Gamyxx[i] * Ayz[i] + Gamzxx[i] * Azz[i]) - chix[i]*Axz[i]/chin1[i]; gyyx[i] = gyyx[i] - ( Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]
gyyx[i] = gyyx[i] - ( Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i] + Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chix[i]*Ayy[i]/chin1[i];
+ Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chix[i]*Ayy[i]/chin1[i]; gyzx[i] = gyzx[i] - ( Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]
gyzx[i] = gyzx[i] - ( Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i] + Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chix[i]*Ayz[i]/chin1[i];
+ Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chix[i]*Ayz[i]/chin1[i]; gzzx[i] = gzzx[i] - ( Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]
gzzx[i] = gzzx[i] - ( Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i] + Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chix[i]*Azz[i]/chin1[i];
+ Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chix[i]*Azz[i]/chin1[i]; gxxy[i] = gxxy[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
gxxy[i] = gxxy[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i] + Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]) - chiy[i]*Axx[i]/chin1[i];
+ Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]) - chiy[i]*Axx[i]/chin1[i]; gxyy[i] = gxyy[i] - ( Gamxyy[i] * Axx[i] + Gamyyy[i] * Axy[i] + Gamzyy[i] * Axz[i]
gxyy[i] = gxyy[i] - ( Gamxyy[i] * Axx[i] + Gamyyy[i] * Axy[i] + Gamzyy[i] * Axz[i] + Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chiy[i]*Axy[i]/chin1[i];
+ Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chiy[i]*Axy[i]/chin1[i]; gxzy[i] = gxzy[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
gxzy[i] = gxzy[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i] + Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chiy[i]*Axz[i]/chin1[i];
+ Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chiy[i]*Axz[i]/chin1[i]; gyyy[i] = gyyy[i] - ( Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]
gyyy[i] = gyyy[i] - ( Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i] + Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]) - chiy[i]*Ayy[i]/chin1[i];
+ Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]) - chiy[i]*Ayy[i]/chin1[i]; gyzy[i] = gyzy[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
gyzy[i] = gyzy[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i] + Gamxyy[i] * Axz[i] + Gamyyy[i] * Ayz[i] + Gamzyy[i] * Azz[i]) - chiy[i]*Ayz[i]/chin1[i];
+ Gamxyy[i] * Axz[i] + Gamyyy[i] * Ayz[i] + Gamzyy[i] * Azz[i]) - chiy[i]*Ayz[i]/chin1[i]; gzzy[i] = gzzy[i] - ( Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]
gzzy[i] = gzzy[i] - ( Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i] + Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiy[i]*Azz[i]/chin1[i];
+ Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiy[i]*Azz[i]/chin1[i]; gxxz[i] = gxxz[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
gxxz[i] = gxxz[i] - ( Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i] + Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]) - chiz[i]*Axx[i]/chin1[i];
+ Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]) - chiz[i]*Axx[i]/chin1[i]; gxyz[i] = gxyz[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
gxyz[i] = gxyz[i] - ( Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i] + Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]) - chiz[i]*Axy[i]/chin1[i];
+ Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]) - chiz[i]*Axy[i]/chin1[i]; gxzz[i] = gxzz[i] - ( Gamxzz[i] * Axx[i] + Gamyzz[i] * Axy[i] + Gamzzz[i] * Axz[i]
gxzz[i] = gxzz[i] - ( Gamxzz[i] * Axx[i] + Gamyzz[i] * Axy[i] + Gamzzz[i] * Axz[i] + Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chiz[i]*Axz[i]/chin1[i];
+ Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chiz[i]*Axz[i]/chin1[i]; gyyz[i] = gyyz[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
gyyz[i] = gyyz[i] - ( Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i] + Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]) - chiz[i]*Ayy[i]/chin1[i];
+ Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]) - chiz[i]*Ayy[i]/chin1[i]; gyzz[i] = gyzz[i] - ( Gamxzz[i] * Axy[i] + Gamyzz[i] * Ayy[i] + Gamzzz[i] * Ayz[i]
gyzz[i] = gyzz[i] - ( Gamxzz[i] * Axy[i] + Gamyzz[i] * Ayy[i] + Gamzzz[i] * Ayz[i] + Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiz[i]*Ayz[i]/chin1[i];
+ Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiz[i]*Ayz[i]/chin1[i]; gzzz[i] = gzzz[i] - ( Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]
gzzz[i] = gzzz[i] - ( Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i] + Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]) - chiz[i]*Azz[i]/chin1[i];
+ Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]) - chiz[i]*Azz[i]/chin1[i];
movx_Res[i] = gupxx[i]*gxxx[i] + gupyy[i]*gxyy[i] + gupzz[i]*gxzz[i] movx_Res[i] = gupxx[i]*gxxx[i] + gupyy[i]*gxyy[i] + gupzz[i]*gxzz[i]
+ gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i] + gupyz[i]*gxzy[i] + gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i] + gupyz[i]*gxzy[i]
+ gupxy[i]*gxxy[i] + gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i]; + gupxy[i]*gxxy[i] + gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i];
movy_Res[i] = gupxx[i]*gxyx[i] + gupyy[i]*gyyy[i] + gupzz[i]*gyzz[i] movy_Res[i] = gupxx[i]*gxyx[i] + gupyy[i]*gyyy[i] + gupzz[i]*gyzz[i]
+ gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i] + gupyz[i]*gyzy[i] + gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i] + gupyz[i]*gyzy[i]
+ gupxy[i]*gxyy[i] + gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i]; + gupxy[i]*gxyy[i] + gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i];
movz_Res[i] = gupxx[i]*gxzx[i] + gupyy[i]*gyzy[i] + gupzz[i]*gzzz[i] movz_Res[i] = gupxx[i]*gxzx[i] + gupyy[i]*gyzy[i] + gupzz[i]*gzzz[i]
+ gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i] + gupyz[i]*gzzy[i] + gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i] + gupyz[i]*gzzy[i]
+ gupxy[i]*gxzy[i] + gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i]; + gupxy[i]*gxzy[i] + gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i];
movx_Res[i] = movx_Res[i] - F2o3*Kx[i] - F8*PI*Sx[i]; movx_Res[i] = movx_Res[i] - F2o3*Kx[i] - F8*PI*Sx[i];
movy_Res[i] = movy_Res[i] - F2o3*Ky[i] - F8*PI*Sy[i]; movy_Res[i] = movy_Res[i] - F2o3*Ky[i] - F8*PI*Sy[i];
movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i]; movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
}
} }
RHS_KERNEL_TIMER_ADD(KB_KO_CONSTRAINT, timer_ko_constraint);

View File

@@ -1513,6 +1513,7 @@
real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh
real*8, dimension(3) :: SoA real*8, dimension(3) :: SoA
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
integer :: i_core_min,i_core_max,j_core_min,j_core_max,k_core_min,k_core_max
real*8 :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz real*8 :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
real*8 :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz real*8 :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2 integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
@@ -1565,9 +1566,47 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
i_core_min = max(1, imin+2)
i_core_max = min(ex(1), imax-2)
j_core_min = max(1, jmin+2)
j_core_max = min(ex(2), jmax-2)
k_core_min = max(1, kmin+2)
k_core_max = min(ex(3), kmax-2)
if(i_core_min <= i_core_max .and. j_core_min <= j_core_max .and. k_core_min <= k_core_max)then
do k=k_core_min,k_core_max
do j=j_core_min,j_core_max
do i=i_core_min,i_core_max
! interior points always use 4th-order stencils without branch checks
fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
-fh(i+2,j,k)+F16*fh(i+1,j,k) )
fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
-fh(i,j+2,k)+F16*fh(i,j+1,k) )
fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
-fh(i,j,k+2)+F16*fh(i,j,k+1) )
fxy(i,j,k) = Fdxdy*( (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k)) &
-F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k)) &
+F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k)) &
- (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
fxz(i,j,k) = Fdxdz*( (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2)) &
-F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1)) &
+F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1)) &
- (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
fyz(i,j,k) = Fdydz*( (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2)) &
-F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1)) &
+F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1)) &
- (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
enddo
enddo
enddo
endif
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
if(i>=i_core_min .and. i<=i_core_max .and. &
j>=j_core_min .and. j<=j_core_max .and. &
k>=k_core_min .and. k<=k_core_max) cycle
!~~~~~~ fxx !~~~~~~ fxx
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !

View File

@@ -1514,6 +1514,81 @@ f_out = f_out*dX*dY*dZ
return return
end subroutine l2normhelper end subroutine l2normhelper
!--------------------------------------------------------------------------------------
! Batched variant of the L2-norm helper: accumulates the sum of squares of
! seven grid functions in one sweep over the patch and returns each sum
! multiplied by dX*dY*dZ in f_out(1:7).
!
!   ex                 : number of grid points in each direction
!   X, Y, Z            : coordinate arrays; spacing is taken from the first two entries
!   xmin..zmax         : limits compared against the first/last coordinates
!   f1..f7             : fields to be squared and summed
!   f_out              : the seven scaled sums, in the order f1..f7
!   gw                 : number of points dropped on each side (ghost width)
subroutine l2normhelper7(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
                         f1,f2,f3,f4,f5,f6,f7,f_out,gw)

  implicit none

!~~~~~~> Input parameters:
  integer,intent(in) :: ex(1:3)
  integer,intent(in) :: gw
  real*8, intent(in) :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
  real*8, intent(in) :: xmin,ymin,zmin,xmax,ymax,zmax
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in) :: f1,f2,f3,f4,f5,f6,f7

!~~~~~~> Output:
  real*8, intent(out) :: f_out(7)

!~~~~~~> Other variables:
  real*8  :: hx, hy, hz
  real*8  :: acc1,acc2,acc3,acc4,acc5,acc6,acc7
  integer :: ilo,jlo,klo
  integer :: ihi,jhi,khi
  integer :: ii,jj,kk

  hx = X(2) - X(1)
  hy = Y(2) - Y(1)
  hz = Z(2) - Z(1)

! Lower index limits: drop gw ghost points, unless the first coordinate lies
! within one grid spacing of the given minimum (patch boundary, not ghost).
  if(abs(X(1)-xmin) < hx)then
     ilo = 1
  else
     ilo = gw+1
  endif
  if(abs(Y(1)-ymin) < hy)then
     jlo = 1
  else
     jlo = gw+1
  endif
  if(abs(Z(1)-zmin) < hz)then
     klo = 1
  else
     klo = gw+1
  endif

! Upper index limits: same rule against the given maximum.
  if(abs(X(ex(1))-xmax) < hx)then
     ihi = ex(1)
  else
     ihi = ex(1) - gw
  endif
  if(abs(Y(ex(2))-ymax) < hy)then
     jhi = ex(2)
  else
     jhi = ex(2) - gw
  endif
  if(abs(Z(ex(3))-zmax) < hz)then
     khi = ex(3)
  else
     khi = ex(3) - gw
  endif

  acc1 = 0.d0
  acc2 = 0.d0
  acc3 = 0.d0
  acc4 = 0.d0
  acc5 = 0.d0
  acc6 = 0.d0
  acc7 = 0.d0

! Single pass over the selected index box; first index innermost.
  do kk=klo,khi
  do jj=jlo,jhi
!DIR$ SIMD REDUCTION(+:acc1,acc2,acc3,acc4,acc5,acc6,acc7)
  do ii=ilo,ihi
     acc1 = acc1 + f1(ii,jj,kk)*f1(ii,jj,kk)
     acc2 = acc2 + f2(ii,jj,kk)*f2(ii,jj,kk)
     acc3 = acc3 + f3(ii,jj,kk)*f3(ii,jj,kk)
     acc4 = acc4 + f4(ii,jj,kk)*f4(ii,jj,kk)
     acc5 = acc5 + f5(ii,jj,kk)*f5(ii,jj,kk)
     acc6 = acc6 + f6(ii,jj,kk)*f6(ii,jj,kk)
     acc7 = acc7 + f7(ii,jj,kk)*f7(ii,jj,kk)
  enddo
  enddo
  enddo

! Scale every sum by the three grid spacings (kept as sum*dX*dY*dZ).
  f_out(1) = acc1*hx*hy*hz
  f_out(2) = acc2*hx*hy*hz
  f_out(3) = acc3*hx*hy*hz
  f_out(4) = acc4*hx*hy*hz
  f_out(5) = acc5*hx*hy*hz
  f_out(6) = acc6*hx*hy*hz
  f_out(7) = acc7*hx*hy*hz

  return

end subroutine l2normhelper7
!-------------------------------------------------------------------------------------- !--------------------------------------------------------------------------------------
! calculate L2norm especially for shell Blocks ! calculate L2norm especially for shell Blocks
subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,& subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&

View File

@@ -13,6 +13,7 @@
#define f_global_interpind2d global_interpind2d #define f_global_interpind2d global_interpind2d
#define f_global_interpind1d global_interpind1d #define f_global_interpind1d global_interpind1d
#define f_l2normhelper l2normhelper #define f_l2normhelper l2normhelper
#define f_l2normhelper7 l2normhelper7
#define f_l2normhelper_sh l2normhelper_sh #define f_l2normhelper_sh l2normhelper_sh
#define f_l2normhelper_sh_rms l2normhelper_sh_rms #define f_l2normhelper_sh_rms l2normhelper_sh_rms
#define f_average average #define f_average average
@@ -42,6 +43,7 @@
#define f_global_interpind2d GLOBAL_INTERPIND2D #define f_global_interpind2d GLOBAL_INTERPIND2D
#define f_global_interpind1d GLOBAL_INTERPIND1D #define f_global_interpind1d GLOBAL_INTERPIND1D
#define f_l2normhelper L2NORMHELPER #define f_l2normhelper L2NORMHELPER
#define f_l2normhelper7 L2NORMHELPER7
#define f_l2normhelper_sh L2NORMHELPER_SH #define f_l2normhelper_sh L2NORMHELPER_SH
#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS #define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
#define f_average AVERAGE #define f_average AVERAGE
@@ -71,6 +73,7 @@
#define f_global_interpind2d global_interpind2d_ #define f_global_interpind2d global_interpind2d_
#define f_global_interpind1d global_interpind1d_ #define f_global_interpind1d global_interpind1d_
#define f_l2normhelper l2normhelper_ #define f_l2normhelper l2normhelper_
#define f_l2normhelper7 l2normhelper7_
#define f_l2normhelper_sh l2normhelper_sh_ #define f_l2normhelper_sh l2normhelper_sh_
#define f_l2normhelper_sh_rms l2normhelper_sh_rms_ #define f_l2normhelper_sh_rms l2normhelper_sh_rms_
#define f_average average_ #define f_average average_
@@ -164,6 +167,15 @@ extern "C"
double *, double &, int &); double *, double &, int &);
} }
extern "C"
{
void f_l2normhelper7(int *, double *, double *, double *,
double &, double &, double &,
double &, double &, double &,
double *, double *, double *, double *,
double *, double *, double *, double *, int &);
}
extern "C" extern "C"
{ {
void f_l2normhelper_sh(int *, double *, double *, double *, void f_l2normhelper_sh(int *, double *, double *, double *,

View File

@@ -17,65 +17,103 @@ using namespace std;
#include <math.h> #include <math.h>
#endif #endif
// Intel oneMKL LAPACK interface /* Linear equation solution by Gauss-Jordan elimination.
#include <mkl_lapacke.h>
/* Linear equation solution using Intel oneMKL LAPACK.
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
containing the right-hand side vectors. On output a is containing the right-hand side vectors. On output a is
replaced by its matrix inverse, and b is replaced by the replaced by its matrix inverse, and b is replaced by the
corresponding set of solution vectors. corresponding set of solution vectors. */
Mathematical equivalence:
Solves: A * x = b => x = A^(-1) * b
Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
within numerical precision. */
int gaussj(double *a, double *b, int n) int gaussj(double *a, double *b, int n)
{ {
// Allocate pivot array and workspace double swap;
lapack_int *ipiv = new lapack_int[n];
lapack_int info;
// Make a copy of matrix a for solving (dgesv modifies it to LU form) int *indxc, *indxr, *ipiv;
double *a_copy = new double[n * n]; indxc = new int[n];
for (int i = 0; i < n * n; i++) { indxr = new int[n];
a_copy[i] = a[i]; ipiv = new int[n];
int i, icol, irow, j, k, l, ll;
double big, dum, pivinv;
for (j = 0; j < n; j++)
ipiv[j] = 0;
for (i = 0; i < n; i++)
{
big = 0.0;
for (j = 0; j < n; j++)
if (ipiv[j] != 1)
for (k = 0; k < n; k++)
{
if (ipiv[k] == 0)
{
if (fabs(a[j * n + k]) >= big)
{
big = fabs(a[j * n + k]);
irow = j;
icol = k;
}
}
else if (ipiv[k] > 1)
{
cout << "gaussj: Singular Matrix-1" << endl;
return 1;
}
}
ipiv[icol] = ipiv[icol] + 1;
if (irow != icol)
{
for (l = 0; l < n; l++)
{
swap = a[irow * n + l];
a[irow * n + l] = a[icol * n + l];
a[icol * n + l] = swap;
}
swap = b[irow];
b[irow] = b[icol];
b[icol] = swap;
}
indxr[i] = irow;
indxc[i] = icol;
if (a[icol * n + icol] == 0.0)
{
cout << "gaussj: Singular Matrix-2" << endl;
return 1;
}
pivinv = 1.0 / a[icol * n + icol];
a[icol * n + icol] = 1.0;
for (l = 0; l < n; l++)
a[icol * n + l] *= pivinv;
b[icol] *= pivinv;
for (ll = 0; ll < n; ll++)
if (ll != icol)
{
dum = a[ll * n + icol];
a[ll * n + icol] = 0.0;
for (l = 0; l < n; l++)
a[ll * n + l] -= a[icol * n + l] * dum;
b[ll] -= b[icol] * dum;
}
} }
// Step 1: Solve linear system A*x = b using LU decomposition for (l = n - 1; l >= 0; l--)
// LAPACKE_dgesv uses column-major by default, but we use row-major {
info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1); if (indxr[l] != indxc[l])
for (k = 0; k < n; k++)
if (info != 0) { {
cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl; swap = a[k * n + indxr[l]];
delete[] ipiv; a[k * n + indxr[l]] = a[k * n + indxc[l]];
delete[] a_copy; a[k * n + indxc[l]] = swap;
return 1; }
}
// Step 2: Compute matrix inverse A^(-1) using LU factorization
// First do LU factorization of original matrix a
info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
}
// Then compute inverse from LU factorization
info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
} }
delete[] indxc;
delete[] indxr;
delete[] ipiv; delete[] ipiv;
delete[] a_copy;
return 0; return 0;
} }

View File

@@ -29,6 +29,16 @@
#define REGLEV 0 #define REGLEV 0
#define BSSN_FINE_TIMING 0
#define BSSN_FINE_TIMING_EVERY 1
#define BSSN_FINE_TIMING_TOPN 8
#define BSSN_KERNEL_FINE_TIMING 0
#define BSSN_ENABLE_STDIN_ABORT_POLL 0
//#define USE_GPU //#define USE_GPU
//#define CHECKDETAIL //#define CHECKDETAIL
@@ -88,6 +98,21 @@
// 0: for every level; // 0: for every level;
// 1: for all // 1: for all
// //
// define BSSN_FINE_TIMING
// enable fine-grained per-timestep timing monitor
//
// define BSSN_FINE_TIMING_EVERY
// report timing every N coarse timesteps
//
// define BSSN_FINE_TIMING_TOPN
// number of hottest timing buckets shown in stdout
//
// define BSSN_KERNEL_FINE_TIMING
// enable split timing inside compute_rhs_bssn
//
// define BSSN_ENABLE_STDIN_ABORT_POLL
// poll stdin and broadcast abort flag every coarse step
//
// define USE_GPU // define USE_GPU
// use gpu or not // use gpu or not
// //
@@ -142,4 +167,3 @@
#define TINY 1e-10 #define TINY 1e-10
#endif /* MICRODEF_H */ #endif /* MICRODEF_H */

View File

@@ -8,27 +8,16 @@ include makefile.inc
POLINT6_USE_BARY ?= 1 POLINT6_USE_BARY ?= 1
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY) POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt) ## Legacy GNU/OpenMPI flags
## make -> opt (PGO-guided, maximum performance) CXXBASEFLAGS = -O3 -march=native -Wno-deprecated -Dfortran3 -Dnewc $(INTERP_LB_FLAGS)
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data) F90BASEFLAGS = -O3 -march=native -cpp -fallow-argument-mismatch $(POLINT6_FLAG)
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
ifeq ($(PGO_MODE),instrument) ifeq ($(PGO_MODE),instrument)
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability CXXAPPFLAGS = $(CXXBASEFLAGS)
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ f90appflags = $(F90BASEFLAGS)
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
else else
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \ CXXAPPFLAGS = $(CXXBASEFLAGS)
## PGO has been turned off, now tested and found to be negative optimization f90appflags = $(F90BASEFLAGS)
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
endif endif
.SUFFIXES: .o .f90 .C .for .cu .SUFFIXES: .o .f90 .C .for .cu
@@ -64,20 +53,17 @@ lopsided_c.o: lopsided_c.C
lopsided_kodis_c.o: lopsided_kodis_c.C lopsided_kodis_c.o: lopsided_kodis_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ # ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata TP_OPTFLAGS = $(CXXBASEFLAGS) $(TP_OPENMP_FLAGS)
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc -I${MKLROOT}/include
TwoPunctures.o: TwoPunctures.C TwoPunctures.o: TwoPunctures.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ ${CXX} $(TP_OPTFLAGS) -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ ${CXX} $(TP_OPTFLAGS) -c $< -o $@
# Input files # Input files
@@ -185,7 +171,7 @@ ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILE
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES) TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) $(CLINKER) $(TP_OPTFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean: clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f rm *.o ABE ABEGPU TwoPunctureABE make.log -f

56
AMSS_NCKU_source/makefile.inc Executable file → Normal file
View File

@@ -1,33 +1,27 @@
## GCC version (commented out) ## Legacy GNU/OpenMPI toolchain configuration
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Intel oneAPI version with oneMKL (Optimized for performance) ## OpenMPI wrappers are installed but may not be on PATH.
filein = -I/usr/include/ -I${MKLROOT}/include OMPI_BIN ?= /usr/lib64/openmpi/bin
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Wrapper compilers
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library f90 = $(OMPI_BIN)/mpifort
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 f77 = $(OMPI_BIN)/mpifort
CXX = $(OMPI_BIN)/mpicxx
CC = $(OMPI_BIN)/mpicc
CLINKER = $(OMPI_BIN)/mpicxx
## Memory allocator switch ## Extra include flags are not needed when using the OpenMPI wrappers.
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc) filein =
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) ## BLAS/LAPACK backend:
## opt : (default) maximum performance with PGO profile-guided optimization ## OpenBLAS on this system provides BLAS, CBLAS and LAPACK symbols.
## instrument : PGO Phase 1 instrumentation to collect fresh profile data BLAS_LAPACK_LIB ?= /lib64/libopenblaso.so.0
PGO_MODE ?= opt LDLIBS = $(BLAS_LAPACK_LIB) -lgfortran -lpthread -lm -ldl
## PGO build mode switch
## off : default legacy GNU build without PGO
## instrument : accepted for compatibility, currently same as off
PGO_MODE ?= off
## Interp_Points load balance profiling mode ## Interp_Points load balance profiling mode
## off : (default) no load balance instrumentation ## off : (default) no load balance instrumentation
@@ -49,17 +43,13 @@ endif
USE_CXX_KERNELS ?= 1 USE_CXX_KERNELS ?= 1
## RK4 kernel implementation switch ## RK4 kernel implementation switch
## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments) ## 1 (default) : use C/C++ rewrite of rungekutta4_rout
## 0 : use original Fortran rungekutta4_rout.o ## 0 : use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1 USE_CXX_RK4 ?= 1
f90 = ifx ## OpenMP is only used for TwoPunctures on the legacy toolchain.
f77 = ifx TP_OPENMP_FLAGS ?= -fopenmp
CXX = icpx
CC = icx
CLINKER = mpiicpx
Cu = nvcc Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc

View File

@@ -1956,10 +1956,13 @@
real*8,dimension(3) :: CD,FD real*8,dimension(3) :: CD,FD
real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果 real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果 real*8 :: tmp_xyz_line(-2:extc(1)) ! 包含 X 向 6 点模板访问所需下界
real*8 :: v1, v2, v3, v4, v5, v6 real*8 :: v1, v2, v3, v4, v5, v6
integer :: ic, jc, kc, ix_offset,ix,iy,iz integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max,kc_min,kc_max
integer :: i_lo, i_hi, j_lo, j_hi, k_lo, k_hi
logical :: need_full_symmetry
real*8 :: res_line real*8 :: res_line
real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2)) ! 包含 Y/X 向模板访问所需下界
if(wei.ne.3)then if(wei.ne.3)then
write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension" write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
write(*,*)"dim = ",wei write(*,*)"dim = ",wei
@@ -2062,17 +2065,60 @@
endif endif
enddo enddo
maxcx = maxval(cix(imino:imaxo)) ic_min = minval(cix(imino:imaxo))
maxcy = maxval(ciy(jmino:jmaxo)) ic_max = maxval(cix(imino:imaxo))
maxcz = maxval(ciz(kmino:kmaxo)) jc_min = minval(ciy(jmino:jmaxo))
jc_max = maxval(ciy(jmino:jmaxo))
kc_min = minval(ciz(kmino:kmaxo))
kc_max = maxval(ciz(kmino:kmaxo))
maxcx = ic_max
maxcy = jc_max
maxcz = kc_max
if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
write(*,*)"error in prolong" write(*,*)"error in prolong"
return return
endif endif
call symmetry_bd(3,extc,func,funcc,SoA) i_lo = ic_min - 2
i_hi = ic_max + 3
j_lo = jc_min - 2
j_hi = jc_max + 3
k_lo = kc_min - 2
k_hi = kc_max + 3
need_full_symmetry = (i_lo < 1) .or. (j_lo < 1) .or. (k_lo < 1)
if(need_full_symmetry)then
call symmetry_bd(3,extc,func,funcc,SoA)
else
funcc(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi) = func(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi)
endif
! For each k (pz and kc fixed), precompute the 2D slab of Z-direction interpolation results
do k = kmino, kmaxo
pz = piz(k); kc = ciz(k)
! --- Pass 1: Z 方向,只算一次 ---
do iy = jc_min-2, jc_max+3 ! only the iy range actually needed (covers jc-2:jc+3)
do ii = ic_min-2, ic_max+3 ! only the ii range actually needed (covers cix-2:cix+3)
tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
end do
end do
do j = jmino, jmaxo
py = piy(j); jc = ciy(j)
! --- Pass 2: Y 方向 ---
do ii = ic_min-2, ic_max+3
tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
end do
! --- Pass 3: X 方向 ---
do i = imino, imaxo
funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3))
end do
end do
end do
!~~~~~~> prolongation start... !~~~~~~> prolongation start...
#if 0
do k = kmino, kmaxo do k = kmino, kmaxo
pz = piz(k) pz = piz(k)
kc = ciz(k) kc = ciz(k)
@@ -2106,28 +2152,7 @@
#if 0
! 1. 【降维Z 向】对当前 (j,k) 相关的 6 条 Y 偏置线进行 Z 向插值
! 结果存入 tmp_yz(x_index, y_offset)
do jj = 1, 6
iy = jc - 3 + jj
do ii = 1, extc(1)
tmp_yz(ii, jj) = WC(1,pz)*funcc(ii, iy, kc-2) + &
WC(2,pz)*funcc(ii, iy, kc-1) + &
WC(3,pz)*funcc(ii, iy, kc ) + &
WC(4,pz)*funcc(ii, iy, kc+1) + &
WC(5,pz)*funcc(ii, iy, kc+2) + &
WC(6,pz)*funcc(ii, iy, kc+3)
end do
end do
! 2. 【降维Y 向】将 Z 向结果合并,得到整条 X 轴线上的 Y-Z 融合值
do ii = 1, extc(1)
tmp_xyz_line(ii) = WC(1,py)*tmp_yz(ii, 1) + WC(2,py)*tmp_yz(ii, 2) + &
WC(3,py)*tmp_yz(ii, 3) + WC(4,py)*tmp_yz(ii, 4) + &
WC(5,py)*tmp_yz(ii, 5) + WC(6,py)*tmp_yz(ii, 6)
end do
#endif
! 3. 【降维X 向】最后在最内层只处理 X 方向的 6 点加权 ! 3. 【降维X 向】最后在最内层只处理 X 方向的 6 点加权
! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次 ! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次
do i = imino, imaxo do i = imino, imaxo
@@ -2145,7 +2170,7 @@
end do end do
end do end do
end do end do
#endif
return return
end subroutine prolong3 end subroutine prolong3
@@ -2345,6 +2370,13 @@
real*8,dimension(3) :: CD,FD real*8,dimension(3) :: CD,FD
real*8 :: tmp_xz_plane(-1:extf(1), 6)
real*8 :: tmp_x_line(-1:extf(1))
integer :: fi, fj, fk, ii, jj, kk
integer :: fi_min, fi_max, ii_lo, ii_hi
integer :: fj_min, fj_max, fk_min, fk_max, jj_lo, jj_hi, kk_lo, kk_hi
logical :: need_full_symmetry
if(wei.ne.3)then if(wei.ne.3)then
write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension" write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
write(*,*)"dim = ",wei write(*,*)"dim = ",wei
@@ -2423,9 +2455,86 @@
stop stop
endif endif
call symmetry_bd(2,extf,funf,funff,SoA) ! 仅计算 X 向最终写回所需的窗口:
! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3)
fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1
fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1
fj_min = 2*(jmino + lbc(2) - 1) - 1 - lbf(2) + 1
fj_max = 2*(jmaxo + lbc(2) - 1) - 1 - lbf(2) + 1
fk_min = 2*(kmino + lbc(3) - 1) - 1 - lbf(3) + 1
fk_max = 2*(kmaxo + lbc(3) - 1) - 1 - lbf(3) + 1
ii_lo = fi_min - 2
ii_hi = fi_max + 3
jj_lo = fj_min - 2
jj_hi = fj_max + 3
kk_lo = fk_min - 2
kk_hi = fk_max + 3
if(ii_lo < -1 .or. ii_hi > extf(1) .or. &
jj_lo < -1 .or. jj_hi > extf(2) .or. &
kk_lo < -1 .or. kk_hi > extf(3))then
write(*,*)"restrict3: invalid stencil window"
write(*,*)"ii=",ii_lo,ii_hi," jj=",jj_lo,jj_hi," kk=",kk_lo,kk_hi
write(*,*)"extf=",extf
stop
endif
need_full_symmetry = (ii_lo < 1) .or. (jj_lo < 1) .or. (kk_lo < 1)
if(need_full_symmetry)then
call symmetry_bd(2,extf,funf,funff,SoA)
else
funff(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi) = funf(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi)
endif
!~~~~~~> restriction start... !~~~~~~> restriction start...
do k = kmino, kmaxo
fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1
do j = jmino, jmaxo
fj = 2*(j + lbc(2) - 1) - 1 - lbf(2) + 1
! 优化点 1: 显式展开 Z 方向计算,减少循环开销
! 确保 ii 循环是最内层且连续访问
!DIR$ VECTOR ALWAYS
do ii = ii_lo, ii_hi
! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
! 这里直接硬编码 jj 的偏移,彻底消除一层循环
tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
C2*(funff(ii,fj-2,fk-1)+funff(ii,fj-2,fk+2)) + &
C3*(funff(ii,fj-2,fk )+funff(ii,fj-2,fk+1))
tmp_xz_plane(ii, 2) = C1*(funff(ii,fj-1,fk-2)+funff(ii,fj-1,fk+3)) + &
C2*(funff(ii,fj-1,fk-1)+funff(ii,fj-1,fk+2)) + &
C3*(funff(ii,fj-1,fk )+funff(ii,fj-1,fk+1))
tmp_xz_plane(ii, 3) = C1*(funff(ii,fj ,fk-2)+funff(ii,fj ,fk+3)) + &
C2*(funff(ii,fj ,fk-1)+funff(ii,fj ,fk+2)) + &
C3*(funff(ii,fj ,fk )+funff(ii,fj ,fk+1))
tmp_xz_plane(ii, 4) = C1*(funff(ii,fj+1,fk-2)+funff(ii,fj+1,fk+3)) + &
C2*(funff(ii,fj+1,fk-1)+funff(ii,fj+1,fk+2)) + &
C3*(funff(ii,fj+1,fk )+funff(ii,fj+1,fk+1))
tmp_xz_plane(ii, 5) = C1*(funff(ii,fj+2,fk-2)+funff(ii,fj+2,fk+3)) + &
C2*(funff(ii,fj+2,fk-1)+funff(ii,fj+2,fk+2)) + &
C3*(funff(ii,fj+2,fk )+funff(ii,fj+2,fk+1))
tmp_xz_plane(ii, 6) = C1*(funff(ii,fj+3,fk-2)+funff(ii,fj+3,fk+3)) + &
C2*(funff(ii,fj+3,fk-1)+funff(ii,fj+3,fk+2)) + &
C3*(funff(ii,fj+3,fk )+funff(ii,fj+3,fk+1))
end do
! 优化点 2: 同样向量化 Y 方向压缩
!DIR$ VECTOR ALWAYS
do ii = ii_lo, ii_hi
tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
end do
! 优化点 3: 最终写入,利用已经缓存在 tmp_x_line 的数据
do i = imino, imaxo
fi = 2*(i + lbc(1) - 1) - 1 - lbf(1) + 1
func(i, j, k) = C1*(tmp_x_line(fi-2) + tmp_x_line(fi+3)) + &
C2*(tmp_x_line(fi-1) + tmp_x_line(fi+2)) + &
C3*(tmp_x_line(fi ) + tmp_x_line(fi+1))
end do
end do
end do
#if 0
do k = kmino,kmaxo do k = kmino,kmaxo
do j = jmino,jmaxo do j = jmino,jmaxo
do i = imino,imaxo do i = imino,imaxo
@@ -2449,7 +2558,7 @@
enddo enddo
enddo enddo
enddo enddo
#endif
return return
end subroutine restrict3 end subroutine restrict3

File diff suppressed because it is too large Load Diff

View File

@@ -36,6 +36,11 @@ private:
double *nx_g, *ny_g, *nz_g; // global list of unit normals double *nx_g, *ny_g, *nz_g; // global list of unit normals
int myrank, cpusize; int myrank, cpusize;
int wave_cache_spinw, wave_cache_maxl, wave_cache_modes;
double *wave_theta_pos, *wave_theta_neg;
double *wave_phi_cos, *wave_phi_sin;
void clear_wave_cache();
void build_wave_cache(int spinw, int maxl);
public: public:
surface_integral(int iSymmetry); surface_integral(int iSymmetry);
@@ -82,13 +87,29 @@ public:
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz, var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor); double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK, void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz, var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor); double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_WaveMassPAng(double rex, int lev, cgh *GH,
var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_WaveMassPAng(double rex, int lev, ShellPatch *GH,
var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_Wave(double rex, cgh *GH, ShellPatch *SH, void surf_Wave(double rex, cgh *GH, ShellPatch *SH,
var *chi, var *trK, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
@@ -115,7 +136,7 @@ public:
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz, var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temporary memory for mass^i var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temporary memory for mass^i
double *Rout, monitor *Monitor, MPI_Comm Comm_here); double *Rout, monitor *Monitor, MPI_Comm Comm_here, bool refresh_mass_fields = true);
void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP, int spinw, int maxl, int NN, double *RP, double *IP,
monitor *Monitor, MPI_Comm Comm_here); monitor *Monitor, MPI_Comm Comm_here);

View File

@@ -97,7 +97,9 @@ Here, we take the Ubuntu 22.04 system as an example
Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer. Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer.
The settings for the Ubuntu 22.04 system do not need to be modified. The default configuration in this branch uses GNU compilers through the OpenMPI wrappers under `/usr/lib64/openmpi/bin`.
If your OpenMPI installation is in another location, update `OMPI_BIN` in `AMSS_NCKU_source/makefile.inc` or export `AMSS_OPENMPI_BIN` before running the Python launcher.
1. Enter the AMSS-NCKU Python code folder and modify the input. 1. Enter the AMSS-NCKU Python code folder and modify the input.

View File

@@ -144,6 +144,62 @@ def generate_macrodef_h():
print( "#define REGLEV 0", file=file1 ) print( "#define REGLEV 0", file=file1 )
print( file=file1 ) print( file=file1 )
# Define fine-grained timing/debug macros.
# All of them default to OFF so production builds do not pay profiling overhead.
fine_timing = getattr(input_data, "Fine_Timing",
getattr(input_data, "Finegrained_Timing", "no"))
kernel_fine_timing = getattr(input_data, "Kernel_Fine_Timing",
getattr(input_data, "BSSN_Kernel_Fine_Timing", "no"))
stdin_abort_poll = getattr(input_data, "Enable_Stdin_Abort_Poll",
getattr(input_data, "Stdin_Abort_Poll", "no"))
timing_report_every = max(1, int(getattr(
input_data, "Timing_Every_Steps",
getattr(input_data, "Timing_Report_Every", 1))))
timing_top_hotspots = max(1, int(getattr(
input_data, "Timing_Top_Hotspots", 8)))
if ( fine_timing == "yes" ):
print( "#define BSSN_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( fine_timing == "no" ):
print( "#define BSSN_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Fine_Timing setting error!!!" )
print()
print( "# Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_EVERY {timing_report_every}", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_TOPN {timing_top_hotspots}", file=file1 )
print( file=file1 )
if ( kernel_fine_timing == "yes" ):
print( "#define BSSN_KERNEL_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( kernel_fine_timing == "no" ):
print( "#define BSSN_KERNEL_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Kernel_Fine_Timing setting error!!!" )
print()
print( "# Kernel_Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
if ( stdin_abort_poll == "yes" ):
print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 1", file=file1 )
print( file=file1 )
elif ( stdin_abort_poll == "no" ):
print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 0", file=file1 )
print( file=file1 )
else:
print( "Enable_Stdin_Abort_Poll setting error!!!" )
print()
print( "# Enable_Stdin_Abort_Poll setting error!!!", file=file1 )
print( file=file1 )
# Define macro USE_GPU # Define macro USE_GPU
# use GPU or not # use GPU or not
@@ -224,6 +280,21 @@ def generate_macrodef_h():
print( "// 0: for every level;", file=file1 ) print( "// 0: for every level;", file=file1 )
print( "// 1: for all", file=file1 ) print( "// 1: for all", file=file1 )
print( "//", file=file1 ) print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING", file=file1 )
print( "// enable fine-grained per-timestep timing monitor", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_EVERY", file=file1 )
print( "// report timing every N coarse timesteps", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_TOPN", file=file1 )
print( "// number of hottest timing buckets shown in stdout", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_KERNEL_FINE_TIMING", file=file1 )
print( "// enable split timing inside compute_rhs_bssn", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_ENABLE_STDIN_ABORT_POLL", file=file1 )
print( "// poll stdin and broadcast abort flag every coarse step", file=file1 )
print( "//", file=file1 )
print( "// define USE_GPU", file=file1 ) print( "// define USE_GPU", file=file1 )
print( "// use gpu or not", file=file1 ) print( "// use gpu or not", file=file1 )
print( "//", file=file1 ) print( "//", file=file1 )

View File

@@ -9,6 +9,7 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import os
import subprocess import subprocess
import time import time
@@ -52,6 +53,8 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
## Build parallelism: match the number of bound cores ## Build parallelism: match the number of bound cores
BUILD_JOBS = 64 BUILD_JOBS = 64
OPENMPI_BIN = os.environ.get("AMSS_OPENMPI_BIN", "/usr/lib64/openmpi/bin")
MPI_RUNNER = os.path.join(OPENMPI_BIN, "mpirun")
################################################################## ##################################################################
@@ -147,11 +150,11 @@ def run_ABE():
## Define the command to run; cast other values to strings as needed ## Define the command to run; cast other values to strings as needed
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" #mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output ## Execute the MPI command and stream output