Compare commits

..

13 Commits

Author SHA1 Message Date
6f62e75245 Add conservative fmisc safe mode 2026-05-17 00:35:17 +08:00
3806e82c47 Prepare conservative plan-b build 2026-05-17 00:08:59 +08:00
359540af4b Fix C derivative ghost-buffer indexing across FD orders 2026-05-14 16:00:11 +08:00
a628b8e2ff Fix fourth-order C lopsided and KO stencil indexing 2026-05-14 15:19:36 +08:00
c6e1a125b8 Fix shell C kernel symbol names for Fortran linkage (fderivs_sh_ etc.)
Shell C functions must export Fortran-compatible symbols with trailing
underscore so bssn_rhs_ss.f90 and getnp4.f90 can link when WithShell is
active and USE_CXX_SHELL_KERNELS=1 replaces Fortran diff_new_sh.o.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-14 14:09:16 +08:00
547c62a116 Add full GAUGE 2-7 support to Z4C C RHS kernel (z4c_rhs_c.C)
Previously only GAUGE 0 and 1 were supported with a compile error for 2-7.
Now supports all 8 gauge choices matching BSSN Fortran formulas:
- GAUGE 2: variable-eta gamma-driver, chi-sqrt denominator
- GAUGE 3: variable-eta gamma-driver, chi-linear denominator
- GAUGE 4: first-order variable-eta, chi-sqrt denominator
- GAUGE 5: first-order variable-eta, chi-linear denominator
- GAUGE 6: Jason's rational position-dependent damping
- GAUGE 7: Jason's exponential position-dependent damping
Also fixes dtSf advection/dissipation guards for gauges where dtSf is active.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-14 13:00:07 +08:00
3e55068548 Add C kernel for BSSN-EM (Maxwell/electromagnetic field) RHS computation
New bssn_em_rhs_c.C computes EM field RHS (E,B,Kpsi,Kphi) and stress-energy
tensor, then calls the C BSSN RHS kernel with source terms. Replaces empart.f90
when USE_CXX_EM_KERNEL=1. Supports all ghost_width orders via existing derivative
kernels. Controlled by USE_CXX_EM_KERNEL switch (default 0, experimental).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-14 11:25:08 +08:00
4a413d669d Add C implementations of shell-patch derivative kernels (WithShell support)
New files provide C equivalents of Fortran diff_new_sh.f90 and kodiss_sh.f90:
- fderivs_sh_c.C: first derivatives in shell (rho, sigma, R) coords
- fdderivs_sh_c.C: second derivatives in shell coords
- fderivs_shc_c.C: shell first derivs + chain rule to Cartesian
- fdderivs_shc_c.C: shell second derivs + chain rule to Cartesian
- kodiss_sh_c.C: Kreiss-Oliger dissipation on shell patches

Also add symmetry_stbd() C implementation and shell fh indexing to share_func.h.
Controlled by USE_CXX_SHELL_KERNELS switch (default 0, experimental).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-14 02:52:48 +08:00
9e51793499 Add full FD order support (2nd/4th/6th/8th) to C derivative kernels via ghost_width dispatch
Wrap each C kernel in #if (ghost_width == N) blocks matching Fortran stencil
coefficients from diff_new.f90, kodiss.f90, and lopsidediff.f90. Add fast-path
indexing for ord=1,4,5 in share_func.h.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-14 02:27:21 +08:00
66bce8c42d Add plot-only restart script to skip recomputation when plotting is interrupted
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-12 14:56:38 +08:00
32529f09a6 Use static OpenMP schedule in ShellPatch::setupintintstuff
Static scheduling has lower overhead than guided for uniform workloads
(grid points all have equal computational cost).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-10 20:05:46 +08:00
fae4093863 Add thread-safe ShellPatch::setupintintstuff with OpenMP
Split prolongpointstru into search-only (prolongpointstru_search) and
append-only (prolongpointstru_append) functions. The search is read-only
and thread-safe; each thread builds private linked lists via
prolongpointstru_append, merged after the parallel loop.

This eliminates critical-section contention and delivers ~2.2x speedup:
setupintintstuff: 511s -> 252s, total init: 592s -> 267s.

Also add -qopenmp to ShellPatch.o compilation via makefile override rule
and <omp.h> include with _OPENMP guards + fallback stubs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-10 20:05:46 +08:00
26f5add3fb Accelerate Shell-Patch CPU interpolation 2026-05-08 14:30:15 +08:00
12 changed files with 484 additions and 296 deletions

View File

@@ -59,7 +59,7 @@ bool shell_fast_interp_enabled()
if (enabled < 0)
{
const char *env = getenv("AMSS_SHELL_FAST_INTERP");
enabled = (env && atoi(env) != 0) ? 1 : 0;
enabled = (!env || atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
@@ -70,7 +70,7 @@ bool shell_parallel_interp_enabled()
if (enabled < 0)
{
const char *env = getenv("AMSS_SHELL_PARALLEL_INTERP");
enabled = (env && atoi(env) != 0) ? 1 : 0;
enabled = (!env || atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}

View File

@@ -27,7 +27,7 @@ using namespace std;
#endif
#include "TwoPunctures.h"
#include <cblas.h>
#include <mkl_cblas.h>
TwoPunctures::TwoPunctures(double mp, double mm, double b,
double P_plusx, double P_plusy, double P_plusz,

View File

@@ -1075,10 +1075,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
}
#endif
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
#endif
for (int i = 0; i < all; i += 1) {
#if (GAUGE == 0)
betax_rhs[i] = FF * dtSfx[i];
@@ -1164,17 +1160,11 @@ int f_compute_rhs_bssn(int *ex, double &T,
lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps);
lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps);
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps);
#endif
lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps);
#endif
lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps);
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
#endif
lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps);
lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps);

View File

@@ -496,7 +496,7 @@ void fdderivs(const int ex[3],
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
bool in4 = has4 && i0>=i4_lo && i0<=i4_hi && j0>=j4_lo && j0<=j4_hi && k0>=k4_lo && k0<=k4_hi;
_Bool in4 = has4 && i0>=i4_lo && i0<=i4_hi && j0>=j4_lo && j0<=j4_hi && k0>=k4_lo && k0<=k4_hi;
if (in4) continue;
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
@@ -628,7 +628,7 @@ void fdderivs(const int ex[3],
#elif (ghost_width == 5)
/* ---- 8th-order ----------------------------------------------------- */
{
const int ord = 5;
const int ord = 4;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3;
@@ -716,18 +716,18 @@ void fdderivs(const int ex[3],
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
bool in4 = has4 && i0>=i4_lo && i0<=i4_hi && j0>=j4_lo && j0<=j4_hi && k0>=k4_lo && k0<=k4_hi;
_Bool in4 = has4 && i0>=i4_lo && i0<=i4_hi && j0>=j4_lo && j0<=j4_hi && k0>=k4_lo && k0<=k4_hi;
if (in4) continue;
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
fxx[p] = Sdxdx * (fh[idx_fh_F_ord5(iF-1,jF,kF,ex)] - TWO*fh[idx_fh_F_ord5(iF,jF,kF,ex)] + fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
fyy[p] = Sdydy * (fh[idx_fh_F_ord5(iF,jF-1,kF,ex)] - TWO*fh[idx_fh_F_ord5(iF,jF,kF,ex)] + fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
fzz[p] = Sdzdz * (fh[idx_fh_F_ord5(iF,jF,kF-1,ex)] - TWO*fh[idx_fh_F_ord5(iF,jF,kF,ex)] + fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
fxx[p] = Sdxdx * (fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] - TWO*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
fyy[p] = Sdydy * (fh[idx_fh_F_ord4(iF,jF-1,kF,ex)] - TWO*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
fzz[p] = Sdzdz * (fh[idx_fh_F_ord4(iF,jF,kF-1,ex)] - TWO*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
fxy[p] = Sdxdy * (fh[idx_fh_F_ord5(iF-1,jF-1,kF,ex)] - fh[idx_fh_F_ord5(iF+1,jF-1,kF,ex)] - fh[idx_fh_F_ord5(iF-1,jF+1,kF,ex)] + fh[idx_fh_F_ord5(iF+1,jF+1,kF,ex)]);
fxz[p] = Sdxdz * (fh[idx_fh_F_ord5(iF-1,jF,kF-1,ex)] - fh[idx_fh_F_ord5(iF+1,jF,kF-1,ex)] - fh[idx_fh_F_ord5(iF-1,jF,kF+1,ex)] + fh[idx_fh_F_ord5(iF+1,jF,kF+1,ex)]);
fyz[p] = Sdydz * (fh[idx_fh_F_ord5(iF,jF-1,kF-1,ex)] - fh[idx_fh_F_ord5(iF,jF+1,kF-1,ex)] - fh[idx_fh_F_ord5(iF,jF-1,kF+1,ex)] + fh[idx_fh_F_ord5(iF,jF+1,kF+1,ex)]);
fxy[p] = Sdxdy * (fh[idx_fh_F_ord4(iF-1,jF-1,kF,ex)] - fh[idx_fh_F_ord4(iF+1,jF-1,kF,ex)] - fh[idx_fh_F_ord4(iF-1,jF+1,kF,ex)] + fh[idx_fh_F_ord4(iF+1,jF+1,kF,ex)]);
fxz[p] = Sdxdz * (fh[idx_fh_F_ord4(iF-1,jF,kF-1,ex)] - fh[idx_fh_F_ord4(iF+1,jF,kF-1,ex)] - fh[idx_fh_F_ord4(iF-1,jF,kF+1,ex)] + fh[idx_fh_F_ord4(iF+1,jF,kF+1,ex)]);
fyz[p] = Sdydz * (fh[idx_fh_F_ord4(iF,jF-1,kF-1,ex)] - fh[idx_fh_F_ord4(iF,jF+1,kF-1,ex)] - fh[idx_fh_F_ord4(iF,jF-1,kF+1,ex)] + fh[idx_fh_F_ord4(iF,jF+1,kF+1,ex)]);
}
}
}
@@ -740,34 +740,34 @@ void fdderivs(const int ex[3],
for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
bool in6 = has6 && i0>=i6_lo && i0<=i6_hi && j0>=j6_lo && j0<=j6_hi && k0>=k6_lo && k0<=k6_hi;
_Bool in6 = has6 && i0>=i6_lo && i0<=i6_hi && j0>=j6_lo && j0<=j6_hi && k0>=k6_lo && k0<=k6_hi;
if (in6) continue;
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
fxx[p] = Fdxdx * (-fh[idx_fh_F_ord5(iF-2,jF,kF,ex)] + F16*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)] - F30*fh[idx_fh_F_ord5(iF,jF,kF,ex)] - fh[idx_fh_F_ord5(iF+2,jF,kF,ex)] + F16*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
fyy[p] = Fdydy * (-fh[idx_fh_F_ord5(iF,jF-2,kF,ex)] + F16*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)] - F30*fh[idx_fh_F_ord5(iF,jF,kF,ex)] - fh[idx_fh_F_ord5(iF,jF+2,kF,ex)] + F16*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
fzz[p] = Fdzdz * (-fh[idx_fh_F_ord5(iF,jF,kF-2,ex)] + F16*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)] - F30*fh[idx_fh_F_ord5(iF,jF,kF,ex)] - fh[idx_fh_F_ord5(iF,jF,kF+2,ex)] + F16*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
fxx[p] = Fdxdx * (-fh[idx_fh_F_ord4(iF-2,jF,kF,ex)] + F16*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] - F30*fh[idx_fh_F_ord4(iF,jF,kF,ex)] - fh[idx_fh_F_ord4(iF+2,jF,kF,ex)] + F16*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
fyy[p] = Fdydy * (-fh[idx_fh_F_ord4(iF,jF-2,kF,ex)] + F16*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)] - F30*fh[idx_fh_F_ord4(iF,jF,kF,ex)] - fh[idx_fh_F_ord4(iF,jF+2,kF,ex)] + F16*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
fzz[p] = Fdzdz * (-fh[idx_fh_F_ord4(iF,jF,kF-2,ex)] + F16*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)] - F30*fh[idx_fh_F_ord4(iF,jF,kF,ex)] - fh[idx_fh_F_ord4(iF,jF,kF+2,ex)] + F16*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
{
const double t_jm2 = (fh[idx_fh_F_ord5(iF-2,jF-2,kF,ex)]-F8*fh[idx_fh_F_ord5(iF-1,jF-2,kF,ex)]+F8*fh[idx_fh_F_ord5(iF+1,jF-2,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF-2,kF,ex)]);
const double t_jm1 = (fh[idx_fh_F_ord5(iF-2,jF-1,kF,ex)]-F8*fh[idx_fh_F_ord5(iF-1,jF-1,kF,ex)]+F8*fh[idx_fh_F_ord5(iF+1,jF-1,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF-1,kF,ex)]);
const double t_jp1 = (fh[idx_fh_F_ord5(iF-2,jF+1,kF,ex)]-F8*fh[idx_fh_F_ord5(iF-1,jF+1,kF,ex)]+F8*fh[idx_fh_F_ord5(iF+1,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF+1,kF,ex)]);
const double t_jp2 = (fh[idx_fh_F_ord5(iF-2,jF+2,kF,ex)]-F8*fh[idx_fh_F_ord5(iF-1,jF+2,kF,ex)]+F8*fh[idx_fh_F_ord5(iF+1,jF+2,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF+2,kF,ex)]);
const double t_jm2 = (fh[idx_fh_F_ord4(iF-2,jF-2,kF,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF-2,kF,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF-2,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF-2,kF,ex)]);
const double t_jm1 = (fh[idx_fh_F_ord4(iF-2,jF-1,kF,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF-1,kF,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF-1,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF-1,kF,ex)]);
const double t_jp1 = (fh[idx_fh_F_ord4(iF-2,jF+1,kF,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF+1,kF,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF+1,kF,ex)]);
const double t_jp2 = (fh[idx_fh_F_ord4(iF-2,jF+2,kF,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF+2,kF,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF+2,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF+2,kF,ex)]);
fxy[p] = Fdxdy * (t_jm2 - F8*t_jm1 + F8*t_jp1 - t_jp2);
}
{
const double t_km2 = (fh[idx_fh_F_ord5(iF-2,jF,kF-2,ex)]-F8*fh[idx_fh_F_ord5(iF-1,jF,kF-2,ex)]+F8*fh[idx_fh_F_ord5(iF+1,jF,kF-2,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF-2,ex)]);
const double t_km1 = (fh[idx_fh_F_ord5(iF-2,jF,kF-1,ex)]-F8*fh[idx_fh_F_ord5(iF-1,jF,kF-1,ex)]+F8*fh[idx_fh_F_ord5(iF+1,jF,kF-1,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF-1,ex)]);
const double t_kp1 = (fh[idx_fh_F_ord5(iF-2,jF,kF+1,ex)]-F8*fh[idx_fh_F_ord5(iF-1,jF,kF+1,ex)]+F8*fh[idx_fh_F_ord5(iF+1,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF+1,ex)]);
const double t_kp2 = (fh[idx_fh_F_ord5(iF-2,jF,kF+2,ex)]-F8*fh[idx_fh_F_ord5(iF-1,jF,kF+2,ex)]+F8*fh[idx_fh_F_ord5(iF+1,jF,kF+2,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF+2,ex)]);
const double t_km2 = (fh[idx_fh_F_ord4(iF-2,jF,kF-2,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF,kF-2,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF,kF-2,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF-2,ex)]);
const double t_km1 = (fh[idx_fh_F_ord4(iF-2,jF,kF-1,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF,kF-1,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF,kF-1,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF-1,ex)]);
const double t_kp1 = (fh[idx_fh_F_ord4(iF-2,jF,kF+1,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF,kF+1,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF+1,ex)]);
const double t_kp2 = (fh[idx_fh_F_ord4(iF-2,jF,kF+2,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF,kF+2,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF,kF+2,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF+2,ex)]);
fxz[p] = Fdxdz * (t_km2 - F8*t_km1 + F8*t_kp1 - t_kp2);
}
{
const double t_km2 = (fh[idx_fh_F_ord5(iF,jF-2,kF-2,ex)]-F8*fh[idx_fh_F_ord5(iF,jF-1,kF-2,ex)]+F8*fh[idx_fh_F_ord5(iF,jF+1,kF-2,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF-2,ex)]);
const double t_km1 = (fh[idx_fh_F_ord5(iF,jF-2,kF-1,ex)]-F8*fh[idx_fh_F_ord5(iF,jF-1,kF-1,ex)]+F8*fh[idx_fh_F_ord5(iF,jF+1,kF-1,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF-1,ex)]);
const double t_kp1 = (fh[idx_fh_F_ord5(iF,jF-2,kF+1,ex)]-F8*fh[idx_fh_F_ord5(iF,jF-1,kF+1,ex)]+F8*fh[idx_fh_F_ord5(iF,jF+1,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF+1,ex)]);
const double t_kp2 = (fh[idx_fh_F_ord5(iF,jF-2,kF+2,ex)]-F8*fh[idx_fh_F_ord5(iF,jF-1,kF+2,ex)]+F8*fh[idx_fh_F_ord5(iF,jF+1,kF+2,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF+2,ex)]);
const double t_km2 = (fh[idx_fh_F_ord4(iF,jF-2,kF-2,ex)]-F8*fh[idx_fh_F_ord4(iF,jF-1,kF-2,ex)]+F8*fh[idx_fh_F_ord4(iF,jF+1,kF-2,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF-2,ex)]);
const double t_km1 = (fh[idx_fh_F_ord4(iF,jF-2,kF-1,ex)]-F8*fh[idx_fh_F_ord4(iF,jF-1,kF-1,ex)]+F8*fh[idx_fh_F_ord4(iF,jF+1,kF-1,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF-1,ex)]);
const double t_kp1 = (fh[idx_fh_F_ord4(iF,jF-2,kF+1,ex)]-F8*fh[idx_fh_F_ord4(iF,jF-1,kF+1,ex)]+F8*fh[idx_fh_F_ord4(iF,jF+1,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF+1,ex)]);
const double t_kp2 = (fh[idx_fh_F_ord4(iF,jF-2,kF+2,ex)]-F8*fh[idx_fh_F_ord4(iF,jF-1,kF+2,ex)]+F8*fh[idx_fh_F_ord4(iF,jF+1,kF+2,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF+2,ex)]);
fyz[p] = Fdydz * (t_km2 - F8*t_km1 + F8*t_kp1 - t_kp2);
}
}
@@ -787,15 +787,15 @@ void fdderivs(const int ex[3],
const size_t p = idx_ex(i0, j0, k0, ex);
fxx[p] = Xdxdx * (
TWO * fh[idx_fh_F_ord5(iF-3,jF,kF,ex)] - F27*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)] + F270*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)] - F490*fh[idx_fh_F_ord5(iF,jF,kF,ex)] + F270*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)] - F27*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)] + TWO*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]);
TWO * fh[idx_fh_F_ord4(iF-3,jF,kF,ex)] - F27*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)] + F270*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] - F490*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + F270*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)] - F27*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)] + TWO*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
fyy[p] = Xdydy * (
TWO * fh[idx_fh_F_ord5(iF,jF-3,kF,ex)] - F27*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)] + F270*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)] - F490*fh[idx_fh_F_ord5(iF,jF,kF,ex)] + F270*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)] - F27*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)] + TWO*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
TWO * fh[idx_fh_F_ord4(iF,jF-3,kF,ex)] - F27*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)] + F270*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)] - F490*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + F270*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)] - F27*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)] + TWO*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
fzz[p] = Xdzdz * (
TWO * fh[idx_fh_F_ord5(iF,jF,kF-3,ex)] - F27*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)] + F270*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)] - F490*fh[idx_fh_F_ord5(iF,jF,kF,ex)] + F270*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)] - F27*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)] + TWO*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
TWO * fh[idx_fh_F_ord4(iF,jF,kF-3,ex)] - F27*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)] + F270*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)] - F490*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + F270*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)] - F27*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)] + TWO*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
{
#define XSTEN6_8(JF, KF_DUMMY) \
(-fh[idx_fh_F_ord5(iF-3,JF,KF_DUMMY,ex)] + F9*fh[idx_fh_F_ord5(iF-2,JF,KF_DUMMY,ex)] - F45*fh[idx_fh_F_ord5(iF-1,JF,KF_DUMMY,ex)] + F45*fh[idx_fh_F_ord5(iF+1,JF,KF_DUMMY,ex)] - F9*fh[idx_fh_F_ord5(iF+2,JF,KF_DUMMY,ex)] + fh[idx_fh_F_ord5(iF+3,JF,KF_DUMMY,ex)])
(-fh[idx_fh_F_ord4(iF-3,JF,KF_DUMMY,ex)] + F9*fh[idx_fh_F_ord4(iF-2,JF,KF_DUMMY,ex)] - F45*fh[idx_fh_F_ord4(iF-1,JF,KF_DUMMY,ex)] + F45*fh[idx_fh_F_ord4(iF+1,JF,KF_DUMMY,ex)] - F9*fh[idx_fh_F_ord4(iF+2,JF,KF_DUMMY,ex)] + fh[idx_fh_F_ord4(iF+3,JF,KF_DUMMY,ex)])
fxy[p] = Xdxdy * (
-XSTEN6_8(jF-3,kF) + F9*XSTEN6_8(jF-2,kF) - F45*XSTEN6_8(jF-1,kF) + F45*XSTEN6_8(jF+1,kF) - F9*XSTEN6_8(jF+2,kF) + XSTEN6_8(jF+3,kF));
fxz[p] = Xdxdz * (
@@ -804,7 +804,7 @@ void fdderivs(const int ex[3],
}
{
#define YSTEN6_8(JF, KF_DUMMY) \
(-fh[idx_fh_F_ord5(iF,JF-3,KF_DUMMY,ex)] + F9*fh[idx_fh_F_ord5(iF,JF-2,KF_DUMMY,ex)] - F45*fh[idx_fh_F_ord5(iF,JF-1,KF_DUMMY,ex)] + F45*fh[idx_fh_F_ord5(iF,JF+1,KF_DUMMY,ex)] - F9*fh[idx_fh_F_ord5(iF,JF+2,KF_DUMMY,ex)] + fh[idx_fh_F_ord5(iF,JF+3,KF_DUMMY,ex)])
(-fh[idx_fh_F_ord4(iF,JF-3,KF_DUMMY,ex)] + F9*fh[idx_fh_F_ord4(iF,JF-2,KF_DUMMY,ex)] - F45*fh[idx_fh_F_ord4(iF,JF-1,KF_DUMMY,ex)] + F45*fh[idx_fh_F_ord4(iF,JF+1,KF_DUMMY,ex)] - F9*fh[idx_fh_F_ord4(iF,JF+2,KF_DUMMY,ex)] + fh[idx_fh_F_ord4(iF,JF+3,KF_DUMMY,ex)])
fyz[p] = Xdydz * (
-YSTEN6_8(jF,kF-3) + F9*YSTEN6_8(jF,kF-2) - F45*YSTEN6_8(jF,kF-1) + F45*YSTEN6_8(jF,kF+1) - F9*YSTEN6_8(jF,kF+2) + YSTEN6_8(jF,kF+3));
#undef YSTEN6_8
@@ -826,44 +826,44 @@ void fdderivs(const int ex[3],
/* Diagonal: [-9,+128,-1008,+8064,-14350,+8064,-1008,+128,-9] / (5040*dx^2) */
fxx[p] = Edxdx * (
-(double)9 * fh[idx_fh_F_ord5(iF - 4, jF, kF, ex)] +
F128 * fh[idx_fh_F_ord5(iF - 3, jF, kF, ex)] -
F1008 * fh[idx_fh_F_ord5(iF - 2, jF, kF, ex)] +
F8064 * fh[idx_fh_F_ord5(iF - 1, jF, kF, ex)] -
F14350* fh[idx_fh_F_ord5(iF, jF, kF, ex)] +
F8064 * fh[idx_fh_F_ord5(iF + 1, jF, kF, ex)] -
F1008 * fh[idx_fh_F_ord5(iF + 2, jF, kF, ex)] +
F128 * fh[idx_fh_F_ord5(iF + 3, jF, kF, ex)] -
(double)9 * fh[idx_fh_F_ord5(iF + 4, jF, kF, ex)]);
-(double)9 * fh[idx_fh_F_ord4(iF - 4, jF, kF, ex)] +
F128 * fh[idx_fh_F_ord4(iF - 3, jF, kF, ex)] -
F1008 * fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] +
F8064 * fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] -
F14350* fh[idx_fh_F_ord4(iF, jF, kF, ex)] +
F8064 * fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)] -
F1008 * fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)] +
F128 * fh[idx_fh_F_ord4(iF + 3, jF, kF, ex)] -
(double)9 * fh[idx_fh_F_ord4(iF + 4, jF, kF, ex)]);
fyy[p] = Edydy * (
-(double)9 * fh[idx_fh_F_ord5(iF, jF - 4, kF, ex)] +
F128 * fh[idx_fh_F_ord5(iF, jF - 3, kF, ex)] -
F1008 * fh[idx_fh_F_ord5(iF, jF - 2, kF, ex)] +
F8064 * fh[idx_fh_F_ord5(iF, jF - 1, kF, ex)] -
F14350* fh[idx_fh_F_ord5(iF, jF, kF, ex)] +
F8064 * fh[idx_fh_F_ord5(iF, jF + 1, kF, ex)] -
F1008 * fh[idx_fh_F_ord5(iF, jF + 2, kF, ex)] +
F128 * fh[idx_fh_F_ord5(iF, jF + 3, kF, ex)] -
(double)9 * fh[idx_fh_F_ord5(iF, jF + 4, kF, ex)]);
-(double)9 * fh[idx_fh_F_ord4(iF, jF - 4, kF, ex)] +
F128 * fh[idx_fh_F_ord4(iF, jF - 3, kF, ex)] -
F1008 * fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] +
F8064 * fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] -
F14350* fh[idx_fh_F_ord4(iF, jF, kF, ex)] +
F8064 * fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)] -
F1008 * fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)] +
F128 * fh[idx_fh_F_ord4(iF, jF + 3, kF, ex)] -
(double)9 * fh[idx_fh_F_ord4(iF, jF + 4, kF, ex)]);
fzz[p] = Edzdz * (
-(double)9 * fh[idx_fh_F_ord5(iF, jF, kF - 4, ex)] +
F128 * fh[idx_fh_F_ord5(iF, jF, kF - 3, ex)] -
F1008 * fh[idx_fh_F_ord5(iF, jF, kF - 2, ex)] +
F8064 * fh[idx_fh_F_ord5(iF, jF, kF - 1, ex)] -
F14350* fh[idx_fh_F_ord5(iF, jF, kF, ex)] +
F8064 * fh[idx_fh_F_ord5(iF, jF, kF + 1, ex)] -
F1008 * fh[idx_fh_F_ord5(iF, jF, kF + 2, ex)] +
F128 * fh[idx_fh_F_ord5(iF, jF, kF + 3, ex)] -
(double)9 * fh[idx_fh_F_ord5(iF, jF, kF + 4, ex)]);
-(double)9 * fh[idx_fh_F_ord4(iF, jF, kF - 4, ex)] +
F128 * fh[idx_fh_F_ord4(iF, jF, kF - 3, ex)] -
F1008 * fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] +
F8064 * fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] -
F14350* fh[idx_fh_F_ord4(iF, jF, kF, ex)] +
F8064 * fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)] -
F1008 * fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)] +
F128 * fh[idx_fh_F_ord4(iF, jF, kF + 3, ex)] -
(double)9 * fh[idx_fh_F_ord4(iF, jF, kF + 4, ex)]);
/* Mixed: 9x9 outer product.
x-stencil: +3*f(i-4)-32*f(i-3)+168*f(i-2)-672*f(i-1)+672*f(i+1)-168*f(i+2)+32*f(i+3)-3*f(i+4)
y/z weights: same [+3,-32,+168,-672,+672,-168,+32,-3] / 705600 */
{
#define XSTEN8(JF, KF_DUMMY) \
(+(double)3*fh[idx_fh_F_ord5(iF-4,JF,KF_DUMMY,ex)] - F32*fh[idx_fh_F_ord5(iF-3,JF,KF_DUMMY,ex)] + F168*fh[idx_fh_F_ord5(iF-2,JF,KF_DUMMY,ex)] - F672*fh[idx_fh_F_ord5(iF-1,JF,KF_DUMMY,ex)] + F672*fh[idx_fh_F_ord5(iF+1,JF,KF_DUMMY,ex)] - F168*fh[idx_fh_F_ord5(iF+2,JF,KF_DUMMY,ex)] + F32*fh[idx_fh_F_ord5(iF+3,JF,KF_DUMMY,ex)] - (double)3*fh[idx_fh_F_ord5(iF+4,JF,KF_DUMMY,ex)])
(+(double)3*fh[idx_fh_F_ord4(iF-4,JF,KF_DUMMY,ex)] - F32*fh[idx_fh_F_ord4(iF-3,JF,KF_DUMMY,ex)] + F168*fh[idx_fh_F_ord4(iF-2,JF,KF_DUMMY,ex)] - F672*fh[idx_fh_F_ord4(iF-1,JF,KF_DUMMY,ex)] + F672*fh[idx_fh_F_ord4(iF+1,JF,KF_DUMMY,ex)] - F168*fh[idx_fh_F_ord4(iF+2,JF,KF_DUMMY,ex)] + F32*fh[idx_fh_F_ord4(iF+3,JF,KF_DUMMY,ex)] - (double)3*fh[idx_fh_F_ord4(iF+4,JF,KF_DUMMY,ex)])
fxy[p] = Edxdy * (
+(double)3*XSTEN8(jF-4,kF) - F32*XSTEN8(jF-3,kF) + F168*XSTEN8(jF-2,kF) - F672*XSTEN8(jF-1,kF) + F672*XSTEN8(jF+1,kF) - F168*XSTEN8(jF+2,kF) + F32*XSTEN8(jF+3,kF) - (double)3*XSTEN8(jF+4,kF));
@@ -875,7 +875,7 @@ void fdderivs(const int ex[3],
}
{
#define YSTEN8(JF, KF_DUMMY) \
(+(double)3*fh[idx_fh_F_ord5(iF,JF-4,KF_DUMMY,ex)] - F32*fh[idx_fh_F_ord5(iF,JF-3,KF_DUMMY,ex)] + F168*fh[idx_fh_F_ord5(iF,JF-2,KF_DUMMY,ex)] - F672*fh[idx_fh_F_ord5(iF,JF-1,KF_DUMMY,ex)] + F672*fh[idx_fh_F_ord5(iF,JF+1,KF_DUMMY,ex)] - F168*fh[idx_fh_F_ord5(iF,JF+2,KF_DUMMY,ex)] + F32*fh[idx_fh_F_ord5(iF,JF+3,KF_DUMMY,ex)] - (double)3*fh[idx_fh_F_ord5(iF,JF+4,KF_DUMMY,ex)])
(+(double)3*fh[idx_fh_F_ord4(iF,JF-4,KF_DUMMY,ex)] - F32*fh[idx_fh_F_ord4(iF,JF-3,KF_DUMMY,ex)] + F168*fh[idx_fh_F_ord4(iF,JF-2,KF_DUMMY,ex)] - F672*fh[idx_fh_F_ord4(iF,JF-1,KF_DUMMY,ex)] + F672*fh[idx_fh_F_ord4(iF,JF+1,KF_DUMMY,ex)] - F168*fh[idx_fh_F_ord4(iF,JF+2,KF_DUMMY,ex)] + F32*fh[idx_fh_F_ord4(iF,JF+3,KF_DUMMY,ex)] - (double)3*fh[idx_fh_F_ord4(iF,JF+4,KF_DUMMY,ex)])
fyz[p] = Edydz * (
+(double)3*YSTEN8(jF,kF-4) - F32*YSTEN8(jF,kF-3) + F168*YSTEN8(jF,kF-2) - F672*YSTEN8(jF,kF-1) + F672*YSTEN8(jF,kF+1) - F168*YSTEN8(jF,kF+2) + F32*YSTEN8(jF,kF+3) - (double)3*YSTEN8(jF,kF+4));

View File

@@ -163,7 +163,7 @@ extern "C" void fdderivs_sh_(const int ex[3],
if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
for(int i0=i2_lo;i0<=i2_hi;++i0){bool in4=has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi;if(in4)continue;
for(int i0=i2_lo;i0<=i2_hi;++i0){_Bool in4=has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi;if(in4)continue;
const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF));
fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF));
@@ -248,7 +248,7 @@ extern "C" void fdderivs_sh_(const int ex[3],
/* 2nd-order pass */
if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
for(int i0=i2_lo;i0<=i2_hi;++i0){bool in4=has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi;if(in4)continue;
for(int i0=i2_lo;i0<=i2_hi;++i0){_Bool in4=has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi;if(in4)continue;
const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF));
fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF));
@@ -260,7 +260,7 @@ extern "C" void fdderivs_sh_(const int ex[3],
/* 4th-order pass */
if(has4){for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1;
for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1;
for(int i0=i4_lo;i0<=i4_hi;++i0){bool in6=has6&&i0>=i6_lo&&i0<=i6_hi&&j0>=j6_lo&&j0<=j6_hi&&k0>=k6_lo&&k0<=k6_hi;if(in6)continue;
for(int i0=i4_lo;i0<=i4_hi;++i0){_Bool in6=has6&&i0>=i6_lo&&i0<=i6_hi&&j0>=j6_lo&&j0<=j6_hi&&k0>=k6_lo&&k0<=k6_hi;if(in6)continue;
const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Fdxdx*(-FH(iF-2,jF,kF)+F16*FH(iF-1,jF,kF)-F30*FH(iF,jF,kF)-FH(iF+2,jF,kF)+F16*FH(iF+1,jF,kF));
fyy[p]=Fdydy*(-FH(iF,jF-2,kF)+F16*FH(iF,jF-1,kF)-F30*FH(iF,jF,kF)-FH(iF,jF+2,kF)+F16*FH(iF,jF+1,kF));

View File

@@ -398,7 +398,7 @@ void fderivs(const int ex[3],
#elif (ghost_width == 5)
/* ---- 8th-order ----------------------------------------------------- */
{
const int ord = 5;
const int ord = 4;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3;
@@ -484,14 +484,14 @@ void fderivs(const int ex[3],
const size_t p = idx_ex(i0, j0, k0, ex);
fx[p] = d2dx * (
-fh[idx_fh_F_ord5(iF - 1, jF, kF, ex)] +
fh[idx_fh_F_ord5(iF + 1, jF, kF, ex)]);
-fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] +
fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)]);
fy[p] = d2dy * (
-fh[idx_fh_F_ord5(iF, jF - 1, kF, ex)] +
fh[idx_fh_F_ord5(iF, jF + 1, kF, ex)]);
-fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] +
fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)]);
fz[p] = d2dz * (
-fh[idx_fh_F_ord5(iF, jF, kF - 1, ex)] +
fh[idx_fh_F_ord5(iF, jF, kF + 1, ex)]);
-fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] +
fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)]);
}
}
}
@@ -507,22 +507,22 @@ void fderivs(const int ex[3],
const size_t p = idx_ex(i0, j0, k0, ex);
fx[p] = d12dx * (
fh[idx_fh_F_ord5(iF - 2, jF, kF, ex)] -
EIT * fh[idx_fh_F_ord5(iF - 1, jF, kF, ex)] +
EIT * fh[idx_fh_F_ord5(iF + 1, jF, kF, ex)] -
fh[idx_fh_F_ord5(iF + 2, jF, kF, ex)]);
fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] -
EIT * fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] +
EIT * fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)] -
fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)]);
fy[p] = d12dy * (
fh[idx_fh_F_ord5(iF, jF - 2, kF, ex)] -
EIT * fh[idx_fh_F_ord5(iF, jF - 1, kF, ex)] +
EIT * fh[idx_fh_F_ord5(iF, jF + 1, kF, ex)] -
fh[idx_fh_F_ord5(iF, jF + 2, kF, ex)]);
fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] -
EIT * fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] +
EIT * fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)] -
fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)]);
fz[p] = d12dz * (
fh[idx_fh_F_ord5(iF, jF, kF - 2, ex)] -
EIT * fh[idx_fh_F_ord5(iF, jF, kF - 1, ex)] +
EIT * fh[idx_fh_F_ord5(iF, jF, kF + 1, ex)] -
fh[idx_fh_F_ord5(iF, jF, kF + 2, ex)]);
fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] -
EIT * fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] +
EIT * fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)] -
fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)]);
}
}
}
@@ -538,28 +538,28 @@ void fderivs(const int ex[3],
const size_t p = idx_ex(i0, j0, k0, ex);
fx[p] = d60dx * (
-fh[idx_fh_F_ord5(iF - 3, jF, kF, ex)] +
F9 * fh[idx_fh_F_ord5(iF - 2, jF, kF, ex)] -
F45 * fh[idx_fh_F_ord5(iF - 1, jF, kF, ex)] +
F45 * fh[idx_fh_F_ord5(iF + 1, jF, kF, ex)] -
F9 * fh[idx_fh_F_ord5(iF + 2, jF, kF, ex)] +
fh[idx_fh_F_ord5(iF + 3, jF, kF, ex)]);
-fh[idx_fh_F_ord4(iF - 3, jF, kF, ex)] +
F9 * fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] -
F45 * fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] +
F45 * fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)] -
F9 * fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)] +
fh[idx_fh_F_ord4(iF + 3, jF, kF, ex)]);
fy[p] = d60dy * (
-fh[idx_fh_F_ord5(iF, jF - 3, kF, ex)] +
F9 * fh[idx_fh_F_ord5(iF, jF - 2, kF, ex)] -
F45 * fh[idx_fh_F_ord5(iF, jF - 1, kF, ex)] +
F45 * fh[idx_fh_F_ord5(iF, jF + 1, kF, ex)] -
F9 * fh[idx_fh_F_ord5(iF, jF + 2, kF, ex)] +
fh[idx_fh_F_ord5(iF, jF + 3, kF, ex)]);
-fh[idx_fh_F_ord4(iF, jF - 3, kF, ex)] +
F9 * fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] -
F45 * fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] +
F45 * fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)] -
F9 * fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)] +
fh[idx_fh_F_ord4(iF, jF + 3, kF, ex)]);
fz[p] = d60dz * (
-fh[idx_fh_F_ord5(iF, jF, kF - 3, ex)] +
F9 * fh[idx_fh_F_ord5(iF, jF, kF - 2, ex)] -
F45 * fh[idx_fh_F_ord5(iF, jF, kF - 1, ex)] +
F45 * fh[idx_fh_F_ord5(iF, jF, kF + 1, ex)] -
F9 * fh[idx_fh_F_ord5(iF, jF, kF + 2, ex)] +
fh[idx_fh_F_ord5(iF, jF, kF + 3, ex)]);
-fh[idx_fh_F_ord4(iF, jF, kF - 3, ex)] +
F9 * fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] -
F45 * fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] +
F45 * fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)] -
F9 * fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)] +
fh[idx_fh_F_ord4(iF, jF, kF + 3, ex)]);
}
}
}
@@ -576,34 +576,34 @@ void fderivs(const int ex[3],
const size_t p = idx_ex(i0, j0, k0, ex);
fx[p] = d840dx * (
+(double)3 * fh[idx_fh_F_ord5(iF - 4, jF, kF, ex)] -
F32 * fh[idx_fh_F_ord5(iF - 3, jF, kF, ex)] +
F168 * fh[idx_fh_F_ord5(iF - 2, jF, kF, ex)] -
F672 * fh[idx_fh_F_ord5(iF - 1, jF, kF, ex)] +
F672 * fh[idx_fh_F_ord5(iF + 1, jF, kF, ex)] -
F168 * fh[idx_fh_F_ord5(iF + 2, jF, kF, ex)] +
F32 * fh[idx_fh_F_ord5(iF + 3, jF, kF, ex)] -
(double)3 * fh[idx_fh_F_ord5(iF + 4, jF, kF, ex)]);
+(double)3 * fh[idx_fh_F_ord4(iF - 4, jF, kF, ex)] -
F32 * fh[idx_fh_F_ord4(iF - 3, jF, kF, ex)] +
F168 * fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] -
F672 * fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] +
F672 * fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)] -
F168 * fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)] +
F32 * fh[idx_fh_F_ord4(iF + 3, jF, kF, ex)] -
(double)3 * fh[idx_fh_F_ord4(iF + 4, jF, kF, ex)]);
fy[p] = d840dy * (
+(double)3 * fh[idx_fh_F_ord5(iF, jF - 4, kF, ex)] -
F32 * fh[idx_fh_F_ord5(iF, jF - 3, kF, ex)] +
F168 * fh[idx_fh_F_ord5(iF, jF - 2, kF, ex)] -
F672 * fh[idx_fh_F_ord5(iF, jF - 1, kF, ex)] +
F672 * fh[idx_fh_F_ord5(iF, jF + 1, kF, ex)] -
F168 * fh[idx_fh_F_ord5(iF, jF + 2, kF, ex)] +
F32 * fh[idx_fh_F_ord5(iF, jF + 3, kF, ex)] -
(double)3 * fh[idx_fh_F_ord5(iF, jF + 4, kF, ex)]);
+(double)3 * fh[idx_fh_F_ord4(iF, jF - 4, kF, ex)] -
F32 * fh[idx_fh_F_ord4(iF, jF - 3, kF, ex)] +
F168 * fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] -
F672 * fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] +
F672 * fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)] -
F168 * fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)] +
F32 * fh[idx_fh_F_ord4(iF, jF + 3, kF, ex)] -
(double)3 * fh[idx_fh_F_ord4(iF, jF + 4, kF, ex)]);
fz[p] = d840dz * (
+(double)3 * fh[idx_fh_F_ord5(iF, jF, kF - 4, ex)] -
F32 * fh[idx_fh_F_ord5(iF, jF, kF - 3, ex)] +
F168 * fh[idx_fh_F_ord5(iF, jF, kF - 2, ex)] -
F672 * fh[idx_fh_F_ord5(iF, jF, kF - 1, ex)] +
F672 * fh[idx_fh_F_ord5(iF, jF, kF + 1, ex)] -
F168 * fh[idx_fh_F_ord5(iF, jF, kF + 2, ex)] +
F32 * fh[idx_fh_F_ord5(iF, jF, kF + 3, ex)] -
(double)3 * fh[idx_fh_F_ord5(iF, jF, kF + 4, ex)]);
+(double)3 * fh[idx_fh_F_ord4(iF, jF, kF - 4, ex)] -
F32 * fh[idx_fh_F_ord4(iF, jF, kF - 3, ex)] +
F168 * fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] -
F672 * fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] +
F672 * fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)] -
F168 * fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)] +
F32 * fh[idx_fh_F_ord4(iF, jF, kF + 3, ex)] -
(double)3 * fh[idx_fh_F_ord4(iF, jF, kF + 4, ex)]);
}
}
}

View File

@@ -312,8 +312,8 @@
end function decide3d
!---------------------------------------------------------------------------------------
subroutine symmetry_bd(ord,extc,func,funcc,SoA)
implicit none
subroutine symmetry_bd(ord,extc,func,funcc,SoA)
implicit none
!~~~~~~> input arguments
integer,intent(in) :: ord
@@ -322,9 +322,12 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
real*8, dimension(-ord+1:extc(1),-ord+1:extc(2),-ord+1:extc(3)),intent(out):: funcc
real*8, dimension(1:3), intent(in) :: SoA
integer::i
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
integer::i
#if USE_FMISC_SAFE_MODE
funcc = 0.d0
#endif
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
enddo
@@ -337,8 +340,8 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
end subroutine symmetry_bd
subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
implicit none
subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
implicit none
!~~~~~~> input arguments
integer,intent(in) :: ord
@@ -347,9 +350,12 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
real*8, dimension(-ord+1:extc(1)+ord,-ord+1:extc(2)+ord,-ord+1:extc(3)+ord),intent(out):: funcc
real*8, dimension(1:3), intent(in) :: SoA
integer::i
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
integer::i
#if USE_FMISC_SAFE_MODE
funcc = 0.d0
#endif
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1)
@@ -365,8 +371,8 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
end subroutine symmetry_tbd
subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
implicit none
subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
implicit none
!~~~~~~> input arguments
integer,intent(in) :: ord
@@ -375,9 +381,12 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
real*8, dimension(-ord+1:extc(1)+ord,-ord+1:extc(2)+ord,extc(3)),intent(out):: funcc
real*8, dimension(2), intent(in) :: SoA
integer::i
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
integer::i
#if USE_FMISC_SAFE_MODE
funcc = 0.d0
#endif
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1)
@@ -1119,6 +1128,10 @@ end subroutine d2dump
#define POLINT6_USE_BARYCENTRIC 1
#endif
#ifndef USE_FMISC_SAFE_MODE
#define USE_FMISC_SAFE_MODE 0
#endif
!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
subroutine polint6_neville(xa, ya, x, y, dy)
implicit none
@@ -1271,7 +1284,9 @@ end subroutine d2dump
real*8 :: dif, dift, hp, h, den_val
if (ordn == 6) then
#if POLINT6_USE_BARYCENTRIC
#if USE_FMISC_SAFE_MODE
call polint6_neville(xa, ya, x, y, dy)
#elif POLINT6_USE_BARYCENTRIC
call polint6_barycentric(xa, ya, x, y, dy)
#else
call polint6_neville(xa, ya, x, y, dy)
@@ -1376,10 +1391,10 @@ end subroutine d2dump
real*8, intent(in) :: x1,x2
real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER
integer :: i,m
real*8, dimension(ordn) :: ymtmp
real*8, dimension(ordn) :: yntmp
#if USE_FMISC_SAFE_MODE || defined(POLINT_LEGACY_ORDER)
integer :: i,m
real*8, dimension(ordn) :: ymtmp
real*8, dimension(ordn) :: yntmp
m=size(x1a)
do i=1,m
@@ -1414,7 +1429,7 @@ end subroutine d2dump
real*8, intent(in) :: x1,x2,x3
real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER
#if USE_FMISC_SAFE_MODE || defined(POLINT_LEGACY_ORDER)
integer :: i,j,m,n
real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(ordn) :: ymtmp
@@ -1502,12 +1517,23 @@ if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1
! Optimized with oneMKL BLAS DDOT for dot product
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat)
#if USE_FMISC_SAFE_MODE
f_out = 0.d0
do k = kmin, kmax
do j = jmin, jmax
do i = imin, imax
f_out = f_out + f(i,j,k)*f(i,j,k)
end do
end do
end do
#else
! Optimized with oneMKL BLAS DDOT for dot product
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat)
#endif
f_out = f_out*dX*dY*dZ
@@ -1565,7 +1591,9 @@ if(dabs(Z(1)-zmin) < dZ) kmin = 1
do k=kmin,kmax
do j=jmin,jmax
#if !USE_FMISC_SAFE_MODE
!DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7)
#endif
do i=imin,imax
s1 = s1 + f1(i,j,k)*f1(i,j,k)
s2 = s2 + f2(i,j,k)*f2(i,j,k)
@@ -1672,12 +1700,23 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif
! Optimized with oneMKL BLAS DDOT for dot product
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat)
#if USE_FMISC_SAFE_MODE
f_out = 0.d0
do k = kmin, kmax
do j = jmin, jmax
do i = imin, imax
f_out = f_out + f(i,j,k)*f(i,j,k)
end do
end do
end do
#else
! Optimized with oneMKL BLAS DDOT for dot product
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat)
#endif
f_out = f_out*dX*dY*dZ
@@ -1769,12 +1808,23 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif
! Optimized with oneMKL BLAS DDOT for dot product
Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(Nout))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
deallocate(f_flat)
#if USE_FMISC_SAFE_MODE
f_out = 0.d0
do k = kmin, kmax
do j = jmin, jmax
do i = imin, imax
f_out = f_out + f(i,j,k)*f(i,j,k)
end do
end do
end do
#else
! Optimized with oneMKL BLAS DDOT for dot product
Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(Nout))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
deallocate(f_flat)
#endif
return
@@ -1878,13 +1928,23 @@ deallocate(f_flat)
real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
integer :: i,j,k
#if USE_FMISC_SAFE_MODE
do k=1,ext(3)
do j=1,ext(2)
do i=1,ext(1)
fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
enddo
enddo
enddo
#else
do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
end do
#endif
return
end subroutine average2
end subroutine average2
!-----------------------------------------------------------------------------
subroutine average2p(ext,f1,f2,f3,fout)
implicit none
@@ -2024,8 +2084,15 @@ deallocate(f_flat)
tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
enddo
! Third dimension: x-direction weighted sum using BLAS DDOT
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
#if USE_FMISC_SAFE_MODE
f_int = 0.d0
do m = 1, ORDN
f_int = f_int + coef(m) * tmp1(m)
end do
#else
! Third dimension: x-direction weighted sum using BLAS DDOT
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
#endif
return
@@ -2091,8 +2158,15 @@ deallocate(f_flat)
tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
enddo
! Use BLAS DDOT for final weighted sum
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
#if USE_FMISC_SAFE_MODE
f_int = 0.d0
do m = 1, ORDN
f_int = f_int + coef(m) * tmp1(m)
end do
#else
! Use BLAS DDOT for final weighted sum
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
#endif
return
@@ -2184,8 +2258,15 @@ deallocate(f_flat)
write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
endif
! Optimized with BLAS DDOT for weighted sum
f_int = DDOT(ORDN, coef, 1, ya, 1)
#if USE_FMISC_SAFE_MODE
f_int = 0.d0
do m = 1, ORDN
f_int = f_int + coef(m) * ya(m)
end do
#else
! Optimized with BLAS DDOT for weighted sum
f_int = DDOT(ORDN, coef, 1, ya, 1)
#endif
return

View File

@@ -18,7 +18,7 @@ using namespace std;
#endif
// Intel oneMKL LAPACK interface
#include <lapacke.h>
#include <mkl_lapacke.h>
/* Linear equation solution using Intel oneMKL LAPACK.
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
containing the right-hand side vectors. On output a is

View File

@@ -349,29 +349,29 @@ void lopsided(const int ex[3],
f_rhs[p] += sfx * d2dx * (
-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
} else if (sfx < ZEO) {
if ((i0-3)>=iminF && i0<=ex1-3) // i-4>=imin && i+2<=imax
if ((i0-4)>=iminF && i0<=ex1-2) // i-4>=imin && i+2<=imax
f_rhs[p] -= sfx * d60dx * (
+F2*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]
-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]
-F30*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]
-fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]);
else if ((i0-4)>=iminF && i0<=ex1-2) // i-5>=imin && i+1<=imax
else if ((i0-5)>=iminF && i0<=ex1-2) // i-5>=imin && i+1<=imax
f_rhs[p] -= sfx * d60dx * (
-F10*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]
+F150*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]
+F50*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]
+F2*fh[idx_fh_F_ord4(iF-5,jF,kF,ex)]);
else if ((i0-2)>=iminF && i0<=ex1-4) // 6th centered
f_rhs[p] += sfx * d60dx * (
else if ((i0-3)>=iminF && i0<=ex1-2) // 6th centered
f_rhs[p] -= sfx * d60dx * (
-fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]
-F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]
-F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
else if ((i0-1)>=iminF && i0<=ex1-3) // 4th
f_rhs[p] += sfx * d12dx * (
else if ((i0-2)>=iminF && i0<=ex1-2) // 4th
f_rhs[p] -= sfx * d12dx * (
fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]
+EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]);
else if (i0>=iminF && i0<=ex1-2) // 2nd
f_rhs[p] += sfx * d2dx * (
else if ((i0-1)>=iminF && i0<=ex1-2) // 2nd
f_rhs[p] -= sfx * d2dx * (
-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
}
@@ -389,16 +389,16 @@ void lopsided(const int ex[3],
else if (j0<=ex2-2 && j0>=jminF)
f_rhs[p] += sfy * d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
} else if (sfy < ZEO) {
if ((j0-3)>=jminF && j0<=ex2-3)
if ((j0-4)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d60dy*(F2*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]);
else if ((j0-4)>=jminF && j0<=ex2-2)
else if ((j0-5)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF-5,kF,ex)]);
else if ((j0-2)>=jminF && j0<=ex2-4)
f_rhs[p] += sfy * d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
else if ((j0-1)>=jminF && j0<=ex2-3)
f_rhs[p] += sfy * d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
else if (j0>=jminF && j0<=ex2-2)
f_rhs[p] += sfy * d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
else if ((j0-3)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
else if ((j0-2)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
else if ((j0-1)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
}
/* ---- z-direction ---- */
@@ -415,16 +415,16 @@ void lopsided(const int ex[3],
else if (k0<=ex3-2 && k0>=kminF)
f_rhs[p] += sfz * d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
} else if (sfz < ZEO) {
if ((k0-3)>=kminF && k0<=ex3-3)
if ((k0-4)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]);
else if ((k0-4)>=kminF && k0<=ex3-2)
else if ((k0-5)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF-5,ex)]);
else if ((k0-2)>=kminF && k0<=ex3-4)
f_rhs[p] += sfz * d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
else if ((k0-1)>=kminF && k0<=ex3-3)
f_rhs[p] += sfz * d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
else if (k0>=kminF && k0<=ex3-2)
f_rhs[p] += sfz * d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
else if ((k0-3)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
else if ((k0-2)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
else if ((k0-1)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
}
}
}
@@ -503,30 +503,30 @@ void lopsided(const int ex[3],
f_rhs[p] += sfx * d2dx * (
-fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
} else if (sfx < ZEO) {
if ((i0-4)>=iminF && i0<=ex1-4)
if ((i0-5)>=iminF && i0<=ex1-2) // i-5>=imin && i+3<=imax
f_rhs[p] -= sfx * d840dx * (
-F5*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]
-F420*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]
+F1050*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]
+F140*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]
+F3*fh[idx_fh_F_ord5(iF-5,jF,kF,ex)]);
else if ((i0-3)>=iminF && i0<=ex1-5) // 8th centered
f_rhs[p] += sfx * d840dx * (
else if ((i0-4)>=iminF && i0<=ex1-2) // 8th centered
f_rhs[p] -= sfx * d840dx * (
+F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]
+F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]
+F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]
+F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]);
else if ((i0-2)>=iminF && i0<=ex1-4) // 6th centered
f_rhs[p] += sfx * d60dx * (
else if ((i0-3)>=iminF && i0<=ex1-2) // 6th centered
f_rhs[p] -= sfx * d60dx * (
-fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]
-F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]
-F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]);
else if ((i0-1)>=iminF && i0<=ex1-3) // 4th centered
f_rhs[p] += sfx * d12dx * (
else if ((i0-2)>=iminF && i0<=ex1-2) // 4th centered
f_rhs[p] -= sfx * d12dx * (
fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]
+EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]);
else if (i0>=iminF && i0<=ex1-2) // 2nd centered
f_rhs[p] += sfx * d2dx * (
else if ((i0-1)>=iminF && i0<=ex1-2) // 2nd centered
f_rhs[p] -= sfx * d2dx * (
-fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
}
@@ -543,16 +543,16 @@ void lopsided(const int ex[3],
else if (j0<=ex2-2 && j0>=jminF)
f_rhs[p] += sfy * d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
} else if (sfy < ZEO) {
if ((j0-4)>=jminF && j0<=ex2-4)
if ((j0-5)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF-5,kF,ex)]);
else if ((j0-3)>=jminF && j0<=ex2-5)
f_rhs[p] += sfy * d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]);
else if ((j0-2)>=jminF && j0<=ex2-4)
f_rhs[p] += sfy * d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
else if ((j0-1)>=jminF && j0<=ex2-3)
f_rhs[p] += sfy * d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]);
else if (j0>=jminF && j0<=ex2-2)
f_rhs[p] += sfy * d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
else if ((j0-4)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]);
else if ((j0-3)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
else if ((j0-2)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]);
else if ((j0-1)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
}
const double sfz = Sfz[p];
@@ -568,16 +568,16 @@ void lopsided(const int ex[3],
else if (k0<=ex3-2 && k0>=kminF)
f_rhs[p] += sfz * d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
} else if (sfz < ZEO) {
if ((k0-4)>=kminF && k0<=ex3-4)
if ((k0-5)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF-5,ex)]);
else if ((k0-3)>=kminF && k0<=ex3-5)
f_rhs[p] += sfz * d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]);
else if ((k0-2)>=kminF && k0<=ex3-4)
f_rhs[p] += sfz * d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
else if ((k0-1)>=kminF && k0<=ex3-3)
f_rhs[p] += sfz * d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]);
else if (k0>=kminF && k0<=ex3-2)
f_rhs[p] += sfz * d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
else if ((k0-4)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]);
else if ((k0-3)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
else if ((k0-2)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]);
else if ((k0-1)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
}
}
}

View File

@@ -3,9 +3,7 @@
/*
* C 版 lopsided_kodis — combined upwind advection + KO dissipation.
* Uses one shared symmetry_bd buffer (ord = ghost_width for both components)
* where a stable merged stencil is available. The 8th-order path delegates to
* the separate lopsided + kodis kernels, matching the original Fortran flow.
* Uses one shared symmetry_bd buffer (ord = ghost_width for both components).
*
* FD order selection via ghost_width:
* 2 → 2nd-order advection + r=2 KO (cof=16, sign=-)
@@ -233,11 +231,11 @@ void lopsided_kodis(const int ex[3],
else if (i0<=ex1-3&&(i0-1)>=iminF) f_rhs[p]+=sfx*d12dx*(fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]);
else if (i0<=ex1-2&&i0>=iminF) f_rhs[p]+=sfx*d2dx*(-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
} else if (sfx<ZEO) {
if ((i0-3)>=iminF&&i0<=ex1-3) f_rhs[p]-=sfx*d60dx*(+F2*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F30*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]);
else if ((i0-4)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-F10*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+F50*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]+F2*fh[idx_fh_F_ord4(iF-5,jF,kF,ex)]);
else if ((i0-2)>=iminF&&i0<=ex1-4) f_rhs[p]+=sfx*d60dx*(-fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
else if ((i0-1)>=iminF&&i0<=ex1-3) f_rhs[p]+=sfx*d12dx*(fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]);
else if (i0>=iminF&&i0<=ex1-2) f_rhs[p]+=sfx*d2dx*(-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
if ((i0-4)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(+F2*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F30*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]);
else if ((i0-5)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-F10*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+F50*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]+F2*fh[idx_fh_F_ord4(iF-5,jF,kF,ex)]);
else if ((i0-3)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
else if ((i0-2)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d12dx*(fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]);
else if ((i0-1)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d2dx*(-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
}
/* y */
const double sfy=Sfy[p];
@@ -248,11 +246,11 @@ void lopsided_kodis(const int ex[3],
else if (j0<=ex2-3&&(j0-1)>=jminF) f_rhs[p]+=sfy*d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
else if (j0<=ex2-2&&j0>=jminF) f_rhs[p]+=sfy*d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
} else if (sfy<ZEO) {
if ((j0-3)>=jminF&&j0<=ex2-3) f_rhs[p]-=sfy*d60dy*(F2*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]);
else if ((j0-4)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF-5,kF,ex)]);
else if ((j0-2)>=jminF&&j0<=ex2-4) f_rhs[p]+=sfy*d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
else if ((j0-1)>=jminF&&j0<=ex2-3) f_rhs[p]+=sfy*d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
else if (j0>=jminF&&j0<=ex2-2) f_rhs[p]+=sfy*d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
if ((j0-4)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(F2*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]);
else if ((j0-5)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF-5,kF,ex)]);
else if ((j0-3)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
else if ((j0-2)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
else if ((j0-1)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
}
/* z */
const double sfz=Sfz[p];
@@ -263,11 +261,11 @@ void lopsided_kodis(const int ex[3],
else if (k0<=ex3-3&&(k0-1)>=kminF) f_rhs[p]+=sfz*d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
else if (k0<=ex3-2&&k0>=kminF) f_rhs[p]+=sfz*d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
} else if (sfz<ZEO) {
if ((k0-3)>=kminF&&k0<=ex3-3) f_rhs[p]-=sfz*d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]);
else if ((k0-4)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF-5,ex)]);
else if ((k0-2)>=kminF&&k0<=ex3-4) f_rhs[p]+=sfz*d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
else if ((k0-1)>=kminF&&k0<=ex3-3) f_rhs[p]+=sfz*d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
else if (k0>=kminF&&k0<=ex3-2) f_rhs[p]+=sfz*d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
if ((k0-4)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]);
else if ((k0-5)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF-5,ex)]);
else if ((k0-3)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
else if ((k0-2)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
else if ((k0-1)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
}
}}}
@@ -294,8 +292,92 @@ void lopsided_kodis(const int ex[3],
}
#elif (ghost_width == 5)
{
lopsided(ex, X, Y, Z, f, f_rhs, Sfx, Sfy, Sfz, Symmetry, SoA);
if (eps > ZEO) kodis(ex, X, Y, Z, f, f_rhs, SoA, Symmetry, eps);
const int ord = 5;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -4;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -4;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -4;
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
double *fh = (double*)malloc(nx*ny*nz*sizeof(double));
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
const double d840dx=ONE/F840/dX, d840dy=ONE/F840/dY, d840dz=ONE/F840/dZ;
const double d60dx=ONE/F60/dX, d60dy=ONE/F60/dY, d60dz=ONE/F60/dZ;
const double d12dx=ONE/F12/dX, d12dy=ONE/F12/dY, d12dz=ONE/F12/dZ;
const double d2dx=ONE/TWO/dX, d2dy=ONE/TWO/dY, d2dz=ONE/TWO/dZ;
/* ---- advection (8th-order lopsided) ---- */
for (int k0=0;k0<=ex3-2;++k0) { const int kF=k0+1;
for (int j0=0;j0<=ex2-2;++j0) { const int jF=j0+1;
for (int i0=0;i0<=ex1-2;++i0) { const int iF=i0+1;
const size_t p=idx_ex(i0,j0,k0,ex);
const double sfx=Sfx[p];
if (sfx>ZEO) {
if (i0<=ex1-6&&(i0-2)>=iminF) f_rhs[p]+=sfx*d840dx*(-F5*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+F140*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]+F3*fh[idx_fh_F_ord5(iF+5,jF,kF,ex)]);
else if (i0<=ex1-5&&(i0-3)>=iminF) f_rhs[p]+=sfx*d840dx*(+F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]);
else if (i0<=ex1-4&&(i0-2)>=iminF) f_rhs[p]+=sfx*d60dx*(-fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]);
else if (i0<=ex1-3&&(i0-1)>=iminF) f_rhs[p]+=sfx*d12dx*(fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]);
else if (i0<=ex1-2&&i0>=iminF) f_rhs[p]+=sfx*d2dx*(-fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
} else if (sfx<ZEO) {
if ((i0-5)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d840dx*(-F5*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]+F140*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]+F3*fh[idx_fh_F_ord5(iF-5,jF,kF,ex)]);
else if ((i0-4)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d840dx*(+F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]);
else if ((i0-3)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]);
else if ((i0-2)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d12dx*(fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]);
else if ((i0-1)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d2dx*(-fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
}
const double sfy=Sfy[p];
if (sfy>ZEO) {
if (j0<=ex2-6&&(j0-2)>=jminF) f_rhs[p]+=sfy*d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF+5,kF,ex)]);
else if (j0<=ex2-5&&(j0-3)>=jminF) f_rhs[p]+=sfy*d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]);
else if (j0<=ex2-4&&(j0-2)>=jminF) f_rhs[p]+=sfy*d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
else if (j0<=ex2-3&&(j0-1)>=jminF) f_rhs[p]+=sfy*d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]);
else if (j0<=ex2-2&&j0>=jminF) f_rhs[p]+=sfy*d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
} else if (sfy<ZEO) {
if ((j0-5)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF-5,kF,ex)]);
else if ((j0-4)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]);
else if ((j0-3)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
else if ((j0-2)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]);
else if ((j0-1)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
}
const double sfz=Sfz[p];
if (sfz>ZEO) {
if (k0<=ex3-6&&(k0-2)>=kminF) f_rhs[p]+=sfz*d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF+5,ex)]);
else if (k0<=ex3-5&&(k0-3)>=kminF) f_rhs[p]+=sfz*d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]);
else if (k0<=ex3-4&&(k0-2)>=kminF) f_rhs[p]+=sfz*d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
else if (k0<=ex3-3&&(k0-1)>=kminF) f_rhs[p]+=sfz*d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]);
else if (k0<=ex3-2&&k0>=kminF) f_rhs[p]+=sfz*d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
} else if (sfz<ZEO) {
if ((k0-5)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF-5,ex)]);
else if ((k0-4)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]);
else if ((k0-3)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
else if ((k0-2)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]);
else if ((k0-1)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
}
}}}
/* ---- KO dissipation (r=5, cof=1024, sign=+) ---- */
if (eps > ZEO) {
const double cof = 1024.0;
const double F10k=10.0, F45k=45.0, F120=120.0, F210=210.0, F252=252.0;
const int i0_lo=(iminF+4>0)?iminF+4:0, j0_lo=(jminF+4>0)?jminF+4:0, k0_lo=(kminF+4>0)?kminF+4:0;
const int i0_hi=imaxF-6, j0_hi=jmaxF-6, k0_hi=kmaxF-6;
if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) {
for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+1;
for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+1;
for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+1;
const size_t p=idx_ex(i0,j0,k0,ex);
const double Dx=((fh[idx_fh_F_ord5(iF-5,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+5,jF,kF,ex)])-F10k*(fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+4,jF,kF,ex)])+F45k*(fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)])-F120*(fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+2,jF,kF,ex)])+F210*(fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)])-F252*fh[idx_fh_F_ord5(iF,jF,kF,ex)])/dX;
const double Dy=((fh[idx_fh_F_ord5(iF,jF-5,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+5,kF,ex)])-F10k*(fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+4,kF,ex)])+F45k*(fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)])-F120*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+2,kF,ex)])+F210*(fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)])-F252*fh[idx_fh_F_ord5(iF,jF,kF,ex)])/dY;
const double Dz=((fh[idx_fh_F_ord5(iF,jF,kF-5,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+5,ex)])-F10k*(fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+4,ex)])+F45k*(fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)])-F120*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+2,ex)])+F210*(fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)])-F252*fh[idx_fh_F_ord5(iF,jF,kF,ex)])/dZ;
f_rhs[p] += (eps/cof)*(Dx+Dy+Dz);
}}}
}
}
free(fh);
return;
}
#else

View File

@@ -55,17 +55,38 @@ EM_KERNEL_FLAG = -DBSSN_USE_EM_C_KERNEL=$(EFFECTIVE_USE_CXX_EM_KERNEL)
## 0 : fallback to Neville path
POLINT6_USE_BARY ?= 1
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
FMISC_SAFE_FLAG = -DUSE_FMISC_SAFE_MODE=$(USE_FMISC_SAFE_MODE)
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
## make -> opt (PGO-guided, maximum performance)
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
## GCC build flags (optimized for x86-64-v4)
## PGO disabled (used negative optimization on Intel; not tested on GCC)
CXXAPPFLAGS = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
-Dfortran3 -Dnewc $(INTERP_LB_FLAGS) \
ifeq ($(PGO_MODE),instrument)
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
CXXAPPFLAGS = -O3 -march=x86-64-v4 -fma -fprofile-instr-generate -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
$(FMISC_SAFE_FLAG) \
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
f90appflags = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
-cpp $(POLINT6_FLAG)
f90appflags = -O3 -march=x86-64-v4 -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) \
$(FMISC_SAFE_FLAG)
else
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
## PGO has been turned off, now tested and found to be negative optimization
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
CXXAPPFLAGS = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
$(FMISC_SAFE_FLAG) \
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
f90appflags = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) \
$(FMISC_SAFE_FLAG)
endif
.SUFFIXES: .o .f90 .C .for .cu
@@ -131,9 +152,9 @@ z4c_rhs_c.o: z4c_rhs_c.C
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
TP_OPTFLAGS = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc $(filein_real)
-Dfortran3 -Dnewc -I${MKLROOT}/include
TwoPunctures.o: TwoPunctures.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@

View File

@@ -1,25 +1,33 @@
## GCC version with OpenMPI and OpenBLAS
OMPI_ROOT = /usr/mpi/gcc/openmpi-4.1.9a1
## GCC version (commented out)
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Ensure mpicxx and final executables find OpenMPI libs at build- and runtime
export LD_LIBRARY_PATH := $(OMPI_ROOT)/lib64:$(LD_LIBRARY_PATH)
## Intel oneAPI version with oneMKL (Optimized for performance)
filein = -I/usr/include/ -I${MKLROOT}/include
filein = -I/usr/include/ -I$(OMPI_ROOT)/include
## OpenBLAS (OpenMP variant) + gfortran runtime
## -Wl,-rpath ensures ABE / TwoPunctureABE find libmpi at runtime without LD_LIBRARY_PATH
LDLIBS = -Wl,-rpath,$(OMPI_ROOT)/lib64 -lopenblaso -lgfortran -lpthread -lm -ldl -lgomp
# OpenMP flag for selective compilation
OMP_FLAG = -fopenmp
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
## Memory allocator switch
## 0 (default) : use system default allocator (ptmalloc)
## 1 : use jemalloc (install jemalloc-devel first)
USE_JEMALLOC ?= 0
ifeq ($(USE_JEMALLOC),1)
LDLIBS := -ljemalloc $(LDLIBS)
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
## opt : (default) maximum performance with PGO profile-guided optimization
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
PGO_MODE ?= opt
## Interp_Points load balance profiling mode
## off : (default) no load balance instrumentation
@@ -36,20 +44,20 @@ INTERP_LB_FLAGS =
endif
## Kernel implementation switch
## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
## 0 : fall back to original Fortran kernels
USE_CXX_KERNELS ?= 1
## 1 : use C++ rewrite of bssn_rhs and helper kernels (faster)
## 0 (default): fall back to original Fortran kernels
USE_CXX_KERNELS ?= 0
## Z4C Cartesian RHS kernel switch
## 1 (default) : use C++ rewrite of Z4c_rhs (main Cartesian path faster)
## 0 : use original Fortran Z4c_rhs.o
USE_CXX_Z4C_KERNELS ?= 1
## 1 : use C++ rewrite of Z4c_rhs (main Cartesian path faster)
## 0 (default): use original Fortran Z4c_rhs.o
USE_CXX_Z4C_KERNELS ?= 0
## BSSN-EScalar RHS switch
## 1 (default) : use BSSN-EScalar C wrapper on the normal patch path
## 1 : use BSSN-EScalar C wrapper on the normal patch path
## 0 : keep the original Fortran BSSN-EScalar RHS for precision-safe runs
## Note: this requires USE_CXX_KERNELS=1 because the wrapper reuses the C BSSN kernel.
USE_CXX_ESCALAR_KERNEL ?= 1
USE_CXX_ESCALAR_KERNEL ?= 0
## BSSN-EM RHS switch
## 1 : use BSSN-EM C kernel (bssn_em_rhs_c.C) on the normal patch path
@@ -64,16 +72,22 @@ USE_CXX_EM_KERNEL ?= 0
USE_TRANSFER_CACHE ?= auto
## RK4 kernel implementation switch
## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
## 0 : use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1
## 1 : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
## 0 (default): use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 0
f90 = gfortran
f77 = gfortran
CXX = g++
CC = gcc
CLINKER = mpicxx
## fmisc conservative mode switch
## 1 : restore lower-optimization / legacy fmisc numerics
## 0 (default): keep the optimized fmisc paths
USE_FMISC_SAFE_MODE ?= 0
f90 = ifx
f77 = ifx
CXX = icpx
CC = icx
CLINKER = mpiicpx
Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc