Compare commits

..

5 Commits

Author SHA1 Message Date
ed89bc029b Fix potential division by zero in reta_val calculation and enable NaN checks
Added a safety check for the denominator in the reta_val calculation to
prevent division by zero when chi approaches zero (e.g., at far-field
boundaries). Also enabled DEBUG_NAN_CHECK macro to catch invalid inputs early.
Initialized output arrays to zero to prevent uninitialized memory access.
2026-01-19 20:29:48 +08:00
19274e93d1 Fix boundary handling in bssn_rhs_opt.f90 to prevent NaNs
Refactored calc_derivs and calc_dderivs to include correct boundary
handling logic matching the legacy code. Implemented fallback to 2nd
order derivatives when near boundaries where 4th order stencils cannot
be used. Added logic to initialize output arrays to zero to avoid
uninitialized memory access.
2026-01-19 20:03:22 +08:00
ae1a474cca Fix compilation errors and complete logic in BSSN RHS optimization 2026-01-19 19:22:52 +08:00
cbb8fb3a87 patched last commit 2026-01-19 17:14:28 +08:00
4472d89a9f Optimize bssn_rhs calculation with cache blocking and vectorization
- Implemented cache blocking (BLK=8) in bssn_rhs_opt.f90 to improve L1/L2 cache hit rate.
- Introduced bssn_rhs_opt.f90 module with vectorized derivative and physics kernels.
- Renamed original implementation to bssn_rhs_legacy.f90 for fallback.
- Updated bssn_rhs.f90 to act as a dispatcher, using the optimized path for ghost_width=3.
- Updated makefile to include new source files.
- Added DEBUG_NAN_CHECK macro to optionally disable NaN checks in production.
2026-01-19 16:39:24 +08:00
14 changed files with 2716 additions and 2044 deletions

View File

@@ -16,7 +16,7 @@ import numpy
File_directory = "GW150914" ## output file directory
Output_directory = "binary_output" ## binary data file directory
## The file directory name should not be too long
MPI_processes = 64 ## number of mpi processes used in the simulation
MPI_processes = 48 ## number of mpi processes used in the simulation
GPU_Calculation = "no" ## Use GPU or not
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)

View File

@@ -5,7 +5,6 @@
#include <cstdio>
#include <cstdlib>
#include <string>
#include <cstring>
#include <iostream>
#include <iomanip>
#include <fstream>
@@ -61,110 +60,13 @@ TwoPunctures::TwoPunctures(double mp, double mm, double b,
F = dvector(0, ntotal - 1);
allocate_derivs(&u, ntotal);
allocate_derivs(&v, ntotal);
// Allocate workspace buffers for hot-path allocation elimination
int N = maximum3(n1, n2, n3);
int maxn = maximum2(n1, n2);
// LineRelax_be workspace (sized for n2)
ws_diag_be = new double[n2];
ws_e_be = new double[n2 - 1];
ws_f_be = new double[n2 - 1];
ws_b_be = new double[n2];
ws_x_be = new double[n2];
// LineRelax_al workspace (sized for n1)
ws_diag_al = new double[n1];
ws_e_al = new double[n1 - 1];
ws_f_al = new double[n1 - 1];
ws_b_al = new double[n1];
ws_x_al = new double[n1];
// ThomasAlgorithm workspace (sized for max(n1,n2))
ws_thomas_y = new double[maxn];
// JFD_times_dv workspace (sized for nvar)
ws_jfd_values = dvector(0, nvar - 1);
allocate_derivs(&ws_jfd_dU, nvar);
allocate_derivs(&ws_jfd_U, nvar);
// chebft_Zeros workspace (sized for N+1)
ws_cheb_c = dvector(0, N);
// fourft workspace (sized for N/2+1 each)
ws_four_a = dvector(0, N / 2);
ws_four_b = dvector(0, N / 2);
// Derivatives_AB3 workspace
ws_deriv_p = dvector(0, N);
ws_deriv_dp = dvector(0, N);
ws_deriv_d2p = dvector(0, N);
ws_deriv_q = dvector(0, N);
ws_deriv_dq = dvector(0, N);
ws_deriv_r = dvector(0, N);
ws_deriv_dr = dvector(0, N);
ws_deriv_indx = ivector(0, N);
// F_of_v workspace
ws_fov_sources = new double[n1 * n2 * n3];
ws_fov_values = dvector(0, nvar - 1);
allocate_derivs(&ws_fov_U, nvar);
// J_times_dv workspace
ws_jtdv_values = dvector(0, nvar - 1);
allocate_derivs(&ws_jtdv_dU, nvar);
allocate_derivs(&ws_jtdv_U, nvar);
}
TwoPunctures::~TwoPunctures()
{
int const nvar = 1, n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi;
int N = maximum3(n1, n2, n3);
free_dvector(F, 0, ntotal - 1);
free_derivs(&u, ntotal);
free_derivs(&v, ntotal);
// Free workspace buffers
delete[] ws_diag_be;
delete[] ws_e_be;
delete[] ws_f_be;
delete[] ws_b_be;
delete[] ws_x_be;
delete[] ws_diag_al;
delete[] ws_e_al;
delete[] ws_f_al;
delete[] ws_b_al;
delete[] ws_x_al;
delete[] ws_thomas_y;
free_dvector(ws_jfd_values, 0, nvar - 1);
free_derivs(&ws_jfd_dU, nvar);
free_derivs(&ws_jfd_U, nvar);
free_dvector(ws_cheb_c, 0, N);
free_dvector(ws_four_a, 0, N / 2);
free_dvector(ws_four_b, 0, N / 2);
free_dvector(ws_deriv_p, 0, N);
free_dvector(ws_deriv_dp, 0, N);
free_dvector(ws_deriv_d2p, 0, N);
free_dvector(ws_deriv_q, 0, N);
free_dvector(ws_deriv_dq, 0, N);
free_dvector(ws_deriv_r, 0, N);
free_dvector(ws_deriv_dr, 0, N);
free_ivector(ws_deriv_indx, 0, N);
delete[] ws_fov_sources;
free_dvector(ws_fov_values, 0, nvar - 1);
free_derivs(&ws_fov_U, nvar);
free_dvector(ws_jtdv_values, 0, nvar - 1);
free_derivs(&ws_jtdv_dU, nvar);
free_derivs(&ws_jtdv_U, nvar);
}
void TwoPunctures::Solve()
@@ -753,7 +655,7 @@ void TwoPunctures::chebft_Zeros(double u[], int n, int inv)
int k, j, isignum;
double fac, sum, Pion, *c;
c = ws_cheb_c;
c = dvector(0, n);
Pion = Pi / n;
if (inv == 0)
{
@@ -784,6 +686,7 @@ void TwoPunctures::chebft_Zeros(double u[], int n, int inv)
}
for (j = 0; j < n; j++)
u[j] = c[j];
free_dvector(c, 0, n);
}
/* --------------------------------------------------------------------------*/
@@ -871,8 +774,8 @@ void TwoPunctures::fourft(double *u, int N, int inv)
double x, x1, fac, Pi_fac, *a, *b;
M = N / 2;
a = ws_four_a;
b = ws_four_b - 1; /* offset to match dvector(1,M) indexing */
a = dvector(0, M);
b = dvector(1, M); /* Actually: b=vector(1,M-1) but this is problematic if M=1*/
fac = 1. / M;
Pi_fac = Pi * fac;
if (inv == 0)
@@ -921,6 +824,8 @@ void TwoPunctures::fourft(double *u, int N, int inv)
iy = -iy;
}
}
free_dvector(a, 0, M);
free_dvector(b, 1, M);
}
/* -----------------------------------------*/
@@ -1213,14 +1118,14 @@ void TwoPunctures::Derivatives_AB3(int nvar, int n1, int n2, int n3, derivs v)
double *p, *dp, *d2p, *q, *dq, *r, *dr;
N = maximum3(n1, n2, n3);
p = ws_deriv_p;
dp = ws_deriv_dp;
d2p = ws_deriv_d2p;
q = ws_deriv_q;
dq = ws_deriv_dq;
r = ws_deriv_r;
dr = ws_deriv_dr;
indx = ws_deriv_indx;
p = dvector(0, N);
dp = dvector(0, N);
d2p = dvector(0, N);
q = dvector(0, N);
dq = dvector(0, N);
r = dvector(0, N);
dr = dvector(0, N);
indx = ivector(0, N);
for (ivar = 0; ivar < nvar; ivar++)
{
@@ -1303,6 +1208,14 @@ void TwoPunctures::Derivatives_AB3(int nvar, int n1, int n2, int n3, derivs v)
}
}
}
free_dvector(p, 0, N);
free_dvector(dp, 0, N);
free_dvector(d2p, 0, N);
free_dvector(q, 0, N);
free_dvector(dq, 0, N);
free_dvector(r, 0, N);
free_dvector(dr, 0, N);
free_ivector(indx, 0, N);
}
/* --------------------------------------------------------------------------*/
void TwoPunctures::Newton(int const nvar, int const n1, int const n2, int const n3,
@@ -1371,11 +1284,10 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
derivs U;
double *sources;
values = ws_fov_values;
U = ws_fov_U;
values = dvector(0, nvar - 1);
allocate_derivs(&U, nvar);
sources = ws_fov_sources;
memset(sources, 0, n1 * n2 * n3 * sizeof(double));
sources = (double *)calloc(n1 * n2 * n3, sizeof(double));
if (0)
{
double *s_x, *s_y, *s_z;
@@ -1530,6 +1442,9 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
{
fclose(debugfile);
}
free(sources);
free_dvector(values, 0, nvar - 1);
free_derivs(&U, nvar);
}
/* --------------------------------------------------------------------------*/
double TwoPunctures::norm_inf(double const *F, int const ntotal)
@@ -1935,12 +1850,11 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl
Derivatives_AB3(nvar, n1, n2, n3, dv);
values = ws_jtdv_values;
dU = ws_jtdv_dU;
U = ws_jtdv_U;
for (i = 0; i < n1; i++)
{
values = dvector(0, nvar - 1);
allocate_derivs(&dU, nvar);
allocate_derivs(&U, nvar);
for (j = 0; j < n2; j++)
{
for (k = 0; k < n3; k++)
@@ -1994,6 +1908,9 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl
}
}
}
free_dvector(values, 0, nvar - 1);
free_derivs(&dU, nvar);
free_derivs(&U, nvar);
}
}
/* --------------------------------------------------------------------------*/
@@ -2040,11 +1957,17 @@ void TwoPunctures::LineRelax_be(double *dv,
{
int j, m, Ic, Ip, Im, col, ivar;
double *diag = ws_diag_be;
double *e = ws_e_be; /* above diagonal */
double *f = ws_f_be; /* below diagonal */
double *b = ws_b_be; /* rhs */
double *x = ws_x_be; /* solution vector */
double *diag = new double[n2];
double *e = new double[n2 - 1]; /* above diagonal */
double *f = new double[n2 - 1]; /* below diagonal */
double *b = new double[n2]; /* rhs */
double *x = new double[n2]; /* solution vector */
// gsl_vector *diag = gsl_vector_alloc(n2);
// gsl_vector *e = gsl_vector_alloc(n2-1); /* above diagonal */
// gsl_vector *f = gsl_vector_alloc(n2-1); /* below diagonal */
// gsl_vector *b = gsl_vector_alloc(n2); /* rhs */
// gsl_vector *x = gsl_vector_alloc(n2); /* solution vector */
for (ivar = 0; ivar < nvar; ivar++)
{
@@ -2054,35 +1977,62 @@ void TwoPunctures::LineRelax_be(double *dv,
}
diag[n2 - 1] = 0;
// gsl_vector_set_zero(diag);
// gsl_vector_set_zero(e);
// gsl_vector_set_zero(f);
for (j = 0; j < n2; j++)
{
Ip = Index(ivar, i, j + 1, k, nvar, n1, n2, n3);
Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
Im = Index(ivar, i, j - 1, k, nvar, n1, n2, n3);
b[j] = rhs[Ic];
// gsl_vector_set(b,j,rhs[Ic]);
for (m = 0; m < ncols[Ic]; m++)
{
col = cols[Ic][m];
if (col != Ip && col != Ic && col != Im)
b[j] -= JFD[Ic][m] * dv[col];
// *gsl_vector_ptr(b, j) -= JFD[Ic][m] * dv[col];
else
{
if (col == Im && j > 0)
f[j - 1] = JFD[Ic][m];
// gsl_vector_set(f,j-1,JFD[Ic][m]);
if (col == Ic)
diag[j] = JFD[Ic][m];
// gsl_vector_set(diag,j,JFD[Ic][m]);
if (col == Ip && j < n2 - 1)
e[j] = JFD[Ic][m];
// gsl_vector_set(e,j,JFD[Ic][m]);
}
}
}
// A x = b
// A = ( d_0 e_0 0 0 )
// ( f_0 d_1 e_1 0 )
// ( 0 f_1 d_2 e_2 )
// ( 0 0 f_2 d_3 )
//
ThomasAlgorithm(n2, f, diag, e, x, b);
// gsl_linalg_solve_tridiag(diag, e, f, b, x);
for (j = 0; j < n2; j++)
{
Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
dv[Ic] = x[j];
// dv[Ic] = gsl_vector_get(x, j);
}
}
delete[] diag;
delete[] e;
delete[] f;
delete[] b;
delete[] x;
// gsl_vector_free(diag);
// gsl_vector_free(e);
// gsl_vector_free(f);
// gsl_vector_free(b);
// gsl_vector_free(x);
}
/* --------------------------------------------------------------------------*/
void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
@@ -2099,8 +2049,8 @@ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
ha, ga, ga2, hb, gb, gb2, hp, gp, gp2, gagb, gagp, gbgp;
derivs dU, U;
dU = ws_jfd_dU;
U = ws_jfd_U;
allocate_derivs(&dU, nvar);
allocate_derivs(&U, nvar);
if (k < 0)
k = k + n3;
@@ -2218,6 +2168,9 @@ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
LinEquations(A, B, X, R, x, r, phi, y, z, dU, U, values);
for (ivar = 0; ivar < nvar; ivar++)
values[ivar] *= FAC;
free_derivs(&dU, nvar);
free_derivs(&U, nvar);
}
#undef FAC
/*-----------------------------------------------------------*/
@@ -2249,11 +2202,17 @@ void TwoPunctures::LineRelax_al(double *dv,
{
int i, m, Ic, Ip, Im, col, ivar;
double *diag = ws_diag_al;
double *e = ws_e_al; /* above diagonal */
double *f = ws_f_al; /* below diagonal */
double *b = ws_b_al; /* rhs */
double *x = ws_x_al; /* solution vector */
double *diag = new double[n1];
double *e = new double[n1 - 1]; /* above diagonal */
double *f = new double[n1 - 1]; /* below diagonal */
double *b = new double[n1]; /* rhs */
double *x = new double[n1]; /* solution vector */
// gsl_vector *diag = gsl_vector_alloc(n1);
// gsl_vector *e = gsl_vector_alloc(n1-1); /* above diagonal */
// gsl_vector *f = gsl_vector_alloc(n1-1); /* below diagonal */
// gsl_vector *b = gsl_vector_alloc(n1); /* rhs */
// gsl_vector *x = gsl_vector_alloc(n1); /* solution vector */
for (ivar = 0; ivar < nvar; ivar++)
{
@@ -2263,35 +2222,57 @@ void TwoPunctures::LineRelax_al(double *dv,
}
diag[n1 - 1] = 0;
// gsl_vector_set_zero(diag);
// gsl_vector_set_zero(e);
// gsl_vector_set_zero(f);
for (i = 0; i < n1; i++)
{
Ip = Index(ivar, i + 1, j, k, nvar, n1, n2, n3);
Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
Im = Index(ivar, i - 1, j, k, nvar, n1, n2, n3);
b[i] = rhs[Ic];
// gsl_vector_set(b,i,rhs[Ic]);
for (m = 0; m < ncols[Ic]; m++)
{
col = cols[Ic][m];
if (col != Ip && col != Ic && col != Im)
b[i] -= JFD[Ic][m] * dv[col];
// *gsl_vector_ptr(b, i) -= JFD[Ic][m] * dv[col];
else
{
if (col == Im && i > 0)
f[i - 1] = JFD[Ic][m];
// gsl_vector_set(f,i-1,JFD[Ic][m]);
if (col == Ic)
diag[i] = JFD[Ic][m];
// gsl_vector_set(diag,i,JFD[Ic][m]);
if (col == Ip && i < n1 - 1)
e[i] = JFD[Ic][m];
// gsl_vector_set(e,i,JFD[Ic][m]);
}
}
}
ThomasAlgorithm(n1, f, diag, e, x, b);
// gsl_linalg_solve_tridiag(diag, e, f, b, x);
for (i = 0; i < n1; i++)
{
Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
dv[Ic] = x[i];
// dv[Ic] = gsl_vector_get(x, i);
}
}
delete[] diag;
delete[] e;
delete[] f;
delete[] b;
delete[] x;
// gsl_vector_free(diag);
// gsl_vector_free(e);
// gsl_vector_free(f);
// gsl_vector_free(b);
// gsl_vector_free(x);
}
/* -------------------------------------------------------------------------*/
// a[N], b[N-1], c[N-1], x[N], q[N]
@@ -2303,29 +2284,44 @@ void TwoPunctures::LineRelax_al(double *dv,
//"Parallel Scientific Computing in C++ and MPI" P361
void TwoPunctures::ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q)
{
// In-place Thomas algorithm: uses a[] as d workspace, b[] as l workspace.
// c[] is already u (above-diagonal). ws_thomas_y is pre-allocated workspace.
int i;
double *y = ws_thomas_y;
double *l, *u, *d, *y;
l = new double[N - 1];
u = new double[N - 1];
d = new double[N];
y = new double[N];
/* LU Decomposition */
d[0] = a[0];
u[0] = c[0];
/* LU Decomposition (in-place: a becomes d, b becomes l) */
for (i = 0; i < N - 2; i++)
{
b[i] = b[i] / a[i];
a[i + 1] = a[i + 1] - b[i] * c[i];
l[i] = b[i] / d[i];
d[i + 1] = a[i + 1] - l[i] * u[i];
u[i + 1] = c[i + 1];
}
b[N - 2] = b[N - 2] / a[N - 2];
a[N - 1] = a[N - 1] - b[N - 2] * c[N - 2];
l[N - 2] = b[N - 2] / d[N - 2];
d[N - 1] = a[N - 1] - l[N - 2] * u[N - 2];
/* Forward Substitution [L][y] = [q] */
y[0] = q[0];
for (i = 1; i < N; i++)
y[i] = q[i] - b[i - 1] * y[i - 1];
y[i] = q[i] - l[i - 1] * y[i - 1];
/* Backward Substitution [U][x] = [y] */
x[N - 1] = y[N - 1] / a[N - 1];
x[N - 1] = y[N - 1] / d[N - 1];
for (i = N - 2; i >= 0; i--)
x[i] = (y[i] - c[i] * x[i + 1]) / a[i];
x[i] = (y[i] - u[i] * x[i + 1]) / d[i];
delete[] l;
delete[] u;
delete[] d;
delete[] y;
return;
}
// --------------------------------------------------------------------------*/
// Calculates the value of v at an arbitrary position (x,y,z) if the spectral coefficients are know*/*/

View File

@@ -42,33 +42,6 @@ private:
int ntotal;
// Pre-allocated workspace buffers for hot-path allocation elimination
// LineRelax_be workspace (sized for n2)
double *ws_diag_be, *ws_e_be, *ws_f_be, *ws_b_be, *ws_x_be;
// LineRelax_al workspace (sized for n1)
double *ws_diag_al, *ws_e_al, *ws_f_al, *ws_b_al, *ws_x_al;
// ThomasAlgorithm workspace (sized for max(n1,n2))
double *ws_thomas_y;
// JFD_times_dv workspace (sized for nvar)
double *ws_jfd_values;
derivs ws_jfd_dU, ws_jfd_U;
// chebft_Zeros workspace (sized for max(n1,n2,n3)+1)
double *ws_cheb_c;
// fourft workspace (sized for max(n1,n2,n3)/2+1 each)
double *ws_four_a, *ws_four_b;
// Derivatives_AB3 workspace
double *ws_deriv_p, *ws_deriv_dp, *ws_deriv_d2p;
double *ws_deriv_q, *ws_deriv_dq;
double *ws_deriv_r, *ws_deriv_dr;
int *ws_deriv_indx;
// F_of_v workspace
double *ws_fov_sources;
double *ws_fov_values;
derivs ws_fov_U;
// J_times_dv workspace
double *ws_jtdv_values;
derivs ws_jtdv_dU, ws_jtdv_U;
struct parameters
{
int nvar, n1, n2, n3;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1103,103 +1103,6 @@
end subroutine fderivs
!-----------------------------------------------------------------------------
! fderivs variant: reuses caller-provided fh work array (memory pool)
!-----------------------------------------------------------------------------
subroutine fderivs_fh(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3, &
symmetry,onoff,fh)
implicit none
integer, intent(in ):: ex(1:3),symmetry,onoff
real*8, dimension(ex(1),ex(2),ex(3)), intent(in ):: f
real*8, dimension(ex(1),ex(2),ex(3)), intent(out):: fx,fy,fz
real*8, intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
real*8, intent(in ):: SYM1,SYM2,SYM3
real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)),intent(inout):: fh
real*8 :: dX,dY,dZ
real*8, dimension(3) :: SoA
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
real*8, parameter :: ZEO=0.d0,ONE=1.d0
real*8, parameter :: TWO=2.d0,EIT=8.d0
real*8, parameter :: F12=1.2d1
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
SoA(1) = SYM1
SoA(2) = SYM2
SoA(3) = SYM3
call symmetry_bd(2,ex,f,fh,SoA)
d12dx = ONE/F12/dX
d12dy = ONE/F12/dY
d12dz = ONE/F12/dZ
d2dx = ONE/TWO/dX
d2dy = ONE/TWO/dY
d2dz = ONE/TWO/dZ
fx = ZEO
fy = ZEO
fz = ZEO
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
#if 0
if(i+2 <= imax .and. i-2 >= imin)then
fx(i,j,k)=d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i+1 <= imax .and. i-1 >= imin)then
fx(i,j,k)=d2dx*(-fh(i-1,j,k)+fh(i+1,j,k))
endif
if(j+2 <= jmax .and. j-2 >= jmin)then
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j+1 <= jmax .and. j-1 >= jmin)then
fy(i,j,k)=d2dy*(-fh(i,j-1,k)+fh(i,j+1,k))
endif
if(k+2 <= kmax .and. k-2 >= kmin)then
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k+1 <= kmax .and. k-1 >= kmin)then
fz(i,j,k)=d2dz*(-fh(i,j,k-1)+fh(i,j,k+1))
endif
#else
if(i+2 <= imax .and. i-2 >= imin .and. &
j+2 <= jmax .and. j-2 >= jmin .and. &
k+2 <= kmax .and. k-2 >= kmin) then
fx(i,j,k)=d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(i+1 <= imax .and. i-1 >= imin .and. &
j+1 <= jmax .and. j-1 >= jmin .and. &
k+1 <= kmax .and. k-1 >= kmin) then
fx(i,j,k)=d2dx*(-fh(i-1,j,k)+fh(i+1,j,k))
fy(i,j,k)=d2dy*(-fh(i,j-1,k)+fh(i,j+1,k))
fz(i,j,k)=d2dz*(-fh(i,j,k-1)+fh(i,j,k+1))
endif
#endif
enddo
enddo
enddo
return
end subroutine fderivs_fh
!-----------------------------------------------------------------------------
!
! single derivatives dx
!
@@ -2037,162 +1940,6 @@
end subroutine fddyz
!-----------------------------------------------------------------------------
! fdderivs variant: reuses caller-provided fh work array (memory pool)
!-----------------------------------------------------------------------------
subroutine fdderivs_fh(ex,f,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
SYM1,SYM2,SYM3,symmetry,onoff,fh)
implicit none
integer, intent(in ):: ex(1:3),symmetry,onoff
real*8, dimension(ex(1),ex(2),ex(3)),intent(in ):: f
real*8, dimension(ex(1),ex(2),ex(3)),intent(out):: fxx,fxy,fxz,fyy,fyz,fzz
real*8, intent(in ):: X(ex(1)),Y(ex(2)),Z(ex(3))
real*8, intent(in ):: SYM1,SYM2,SYM3
real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)),intent(inout):: fh
real*8 :: dX,dY,dZ
real*8, dimension(3) :: SoA
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
real*8 :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
real*8, parameter :: ZEO=0.d0, ONE=1.d0, TWO=2.d0, F1o4=2.5d-1
real*8, parameter :: F8=8.d0, F16=1.6d1, F30=3.d1
real*8, parameter :: F1o12=ONE/1.2d1, F1o144=ONE/1.44d2
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
SoA(1) = SYM1
SoA(2) = SYM2
SoA(3) = SYM3
call symmetry_bd(2,ex,f,fh,SoA)
Sdxdx = ONE /( dX * dX )
Sdydy = ONE /( dY * dY )
Sdzdz = ONE /( dZ * dZ )
Fdxdx = F1o12 /( dX * dX )
Fdydy = F1o12 /( dY * dY )
Fdzdz = F1o12 /( dZ * dZ )
Sdxdy = F1o4 /( dX * dY )
Sdxdz = F1o4 /( dX * dZ )
Sdydz = F1o4 /( dY * dZ )
Fdxdy = F1o144 /( dX * dY )
Fdxdz = F1o144 /( dX * dZ )
Fdydz = F1o144 /( dY * dZ )
fxx = ZEO
fyy = ZEO
fzz = ZEO
fxy = ZEO
fxz = ZEO
fyz = ZEO
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
#if 0
if(i+2 <= imax .and. i-2 >= imin)then
fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
-fh(i+2,j,k)+F16*fh(i+1,j,k) )
elseif(i+1 <= imax .and. i-1 >= imin)then
fxx(i,j,k) = Sdxdx*(fh(i-1,j,k)-TWO*fh(i,j,k)+fh(i+1,j,k))
endif
if(j+2 <= jmax .and. j-2 >= jmin)then
fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
-fh(i,j+2,k)+F16*fh(i,j+1,k) )
elseif(j+1 <= jmax .and. j-1 >= jmin)then
fyy(i,j,k) = Sdydy*(fh(i,j-1,k)-TWO*fh(i,j,k)+fh(i,j+1,k))
endif
if(k+2 <= kmax .and. k-2 >= kmin)then
fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
-fh(i,j,k+2)+F16*fh(i,j,k+1) )
elseif(k+1 <= kmax .and. k-1 >= kmin)then
fzz(i,j,k) = Sdzdz*(fh(i,j,k-1)-TWO*fh(i,j,k)+fh(i,j,k+1))
endif
if(i+2 <= imax .and. i-2 >= imin .and. j+2 <= jmax .and. j-2 >= jmin)then
fxy(i,j,k) = Fdxdy*( (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k)) &
-F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k)) &
+F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k)) &
- (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
elseif(i+1 <= imax .and. i-1 >= imin .and. j+1 <= jmax .and. j-1 >= jmin)then
fxy(i,j,k) = Sdxdy*(fh(i-1,j-1,k)-fh(i+1,j-1,k)-fh(i-1,j+1,k)+fh(i+1,j+1,k))
endif
if(i+2 <= imax .and. i-2 >= imin .and. k+2 <= kmax .and. k-2 >= kmin)then
fxz(i,j,k) = Fdxdz*( (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2)) &
-F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1)) &
+F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1)) &
- (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
elseif(i+1 <= imax .and. i-1 >= imin .and. k+1 <= kmax .and. k-1 >= kmin)then
fxz(i,j,k) = Sdxdz*(fh(i-1,j,k-1)-fh(i+1,j,k-1)-fh(i-1,j,k+1)+fh(i+1,j,k+1))
endif
if(j+2 <= jmax .and. j-2 >= jmin .and. k+2 <= kmax .and. k-2 >= kmin)then
fyz(i,j,k) = Fdydz*( (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2)) &
-F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1)) &
+F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1)) &
- (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
elseif(j+1 <= jmax .and. j-1 >= jmin .and. k+1 <= kmax .and. k-1 >= kmin)then
fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1))
endif
#else
! for bam comparison
if(i+2 <= imax .and. i-2 >= imin .and. &
j+2 <= jmax .and. j-2 >= jmin .and. &
k+2 <= kmax .and. k-2 >= kmin) then
fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
-fh(i+2,j,k)+F16*fh(i+1,j,k) )
fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
-fh(i,j+2,k)+F16*fh(i,j+1,k) )
fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
-fh(i,j,k+2)+F16*fh(i,j,k+1) )
fxy(i,j,k) = Fdxdy*( (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k)) &
-F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k)) &
+F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k)) &
- (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
fxz(i,j,k) = Fdxdz*( (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2)) &
-F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1)) &
+F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1)) &
- (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
fyz(i,j,k) = Fdydz*( (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2)) &
-F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1)) &
+F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1)) &
- (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
elseif(i+1 <= imax .and. i-1 >= imin .and. &
j+1 <= jmax .and. j-1 >= jmin .and. &
k+1 <= kmax .and. k-1 >= kmin) then
fxx(i,j,k) = Sdxdx*(fh(i-1,j,k)-TWO*fh(i,j,k)+fh(i+1,j,k))
fyy(i,j,k) = Sdydy*(fh(i,j-1,k)-TWO*fh(i,j,k)+fh(i,j+1,k))
fzz(i,j,k) = Sdzdz*(fh(i,j,k-1)-TWO*fh(i,j,k)+fh(i,j,k+1))
fxy(i,j,k) = Sdxdy*(fh(i-1,j-1,k)-fh(i+1,j-1,k)-fh(i-1,j+1,k)+fh(i+1,j+1,k))
fxz(i,j,k) = Sdxdz*(fh(i-1,j,k-1)-fh(i+1,j,k-1)-fh(i-1,j,k+1)+fh(i+1,j,k+1))
fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1))
endif
#endif
enddo
enddo
enddo
return
end subroutine fdderivs_fh
#elif (ghost_width == 4)
! sixth order code

View File

@@ -19,60 +19,48 @@
!~~~~~~~> Local variable:
integer :: i,j,k
real*8 :: lgxx,lgyy,lgzz,ldetg
real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
real*8 :: ltrA,lscale
real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg
real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz
real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
!~~~~~~>
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
gxx = dxx + ONE
gyy = dyy + ONE
gzz = dzz + ONE
lgxx = dxx(i,j,k) + ONE
lgyy = dyy(i,j,k) + ONE
lgzz = dzz(i,j,k) + ONE
detg = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
gupxx = ( gyy * gzz - gyz * gyz ) / detg
gupxy = - ( gxy * gzz - gyz * gxz ) / detg
gupxz = ( gxy * gyz - gyy * gxz ) / detg
gupyy = ( gxx * gzz - gxz * gxz ) / detg
gupyz = - ( gxx * gyz - gxy * gxz ) / detg
gupzz = ( gxx * gyy - gxy * gxy ) / detg
ldetg = lgxx * lgyy * lgzz &
+ gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) &
+ gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
- gxz(i,j,k) * lgyy * gxz(i,j,k) &
- gxy(i,j,k) * gxy(i,j,k) * lgzz &
- lgxx * gyz(i,j,k) * gyz(i,j,k)
trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz &
+ TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
lgupxx = ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg
lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg
lgupxz = ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg
lgupyy = ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg
lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg
lgupzz = ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg
Axx = Axx - F1o3 * gxx * trA
Axy = Axy - F1o3 * gxy * trA
Axz = Axz - F1o3 * gxz * trA
Ayy = Ayy - F1o3 * gyy * trA
Ayz = Ayz - F1o3 * gyz * trA
Azz = Azz - F1o3 * gzz * trA
ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+ lgupzz * Azz(i,j,k) &
+ TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
+ lgupyz * Ayz(i,j,k))
detg = ONE / ( detg ** F1o3 )
Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA
Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA
Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA
Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
gxx = gxx * detg
gxy = gxy * detg
gxz = gxz * detg
gyy = gyy * detg
gyz = gyz * detg
gzz = gzz * detg
lscale = ONE / ( ldetg ** F1o3 )
dxx(i,j,k) = lgxx * lscale - ONE
gxy(i,j,k) = gxy(i,j,k) * lscale
gxz(i,j,k) = gxz(i,j,k) * lscale
dyy(i,j,k) = lgyy * lscale - ONE
gyz(i,j,k) = gyz(i,j,k) * lscale
dzz(i,j,k) = lgzz * lscale - ONE
enddo
enddo
enddo
dxx = gxx - ONE
dyy = gyy - ONE
dzz = gzz - ONE
return
@@ -95,70 +83,50 @@
!~~~~~~~> Local variable:
integer :: i,j,k
real*8 :: lgxx,lgyy,lgzz,lscale
real*8 :: lgxy,lgxz,lgyz
real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
real*8 :: ltrA
real*8, dimension(ex(1),ex(2),ex(3)) :: trA
real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz
real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
!~~~~~~>
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
gxx = dxx + ONE
gyy = dyy + ONE
gzz = dzz + ONE
! for g
gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
! for g: normalize determinant first
lgxx = dxx(i,j,k) + ONE
lgyy = dyy(i,j,k) + ONE
lgzz = dzz(i,j,k) + ONE
lgxy = gxy(i,j,k)
lgxz = gxz(i,j,k)
lgyz = gyz(i,j,k)
gupzz = ONE / ( gupzz ** F1o3 )
lscale = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz &
+ lgxz * lgxy * lgyz - lgxz * lgyy * lgxz &
- lgxy * lgxy * lgzz - lgxx * lgyz * lgyz
gxx = gxx * gupzz
gxy = gxy * gupzz
gxz = gxz * gupzz
gyy = gyy * gupzz
gyz = gyz * gupzz
gzz = gzz * gupzz
lscale = ONE / ( lscale ** F1o3 )
dxx = gxx - ONE
dyy = gyy - ONE
dzz = gzz - ONE
! for A
lgxx = lgxx * lscale
lgxy = lgxy * lscale
lgxz = lgxz * lscale
lgyy = lgyy * lscale
lgyz = lgyz * lscale
lgzz = lgzz * lscale
gupxx = ( gyy * gzz - gyz * gyz )
gupxy = - ( gxy * gzz - gyz * gxz )
gupxz = ( gxy * gyz - gyy * gxz )
gupyy = ( gxx * gzz - gxz * gxz )
gupyz = - ( gxx * gyz - gxy * gxz )
gupzz = ( gxx * gyy - gxy * gxy )
dxx(i,j,k) = lgxx - ONE
gxy(i,j,k) = lgxy
gxz(i,j,k) = lgxz
dyy(i,j,k) = lgyy - ONE
gyz(i,j,k) = lgyz
dzz(i,j,k) = lgzz - ONE
trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz &
+ TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
! for A: trace-free using normalized metric (det=1, no division needed)
lgupxx = ( lgyy * lgzz - lgyz * lgyz )
lgupxy = - ( lgxy * lgzz - lgyz * lgxz )
lgupxz = ( lgxy * lgyz - lgyy * lgxz )
lgupyy = ( lgxx * lgzz - lgxz * lgxz )
lgupyz = - ( lgxx * lgyz - lgxy * lgxz )
lgupzz = ( lgxx * lgyy - lgxy * lgxy )
ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+ lgupzz * Azz(i,j,k) &
+ TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
+ lgupyz * Ayz(i,j,k))
Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
enddo
enddo
enddo
Axx = Axx - F1o3 * gxx * trA
Axy = Axy - F1o3 * gxy * trA
Axz = Axz - F1o3 * gxz * trA
Ayy = Ayy - F1o3 * gyy * trA
Ayz = Ayz - F1o3 * gyz * trA
Azz = Azz - F1o3 * gzz * trA
return

View File

@@ -324,6 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -349,6 +350,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -377,6 +379,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -883,6 +886,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -908,6 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -936,6 +941,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -1112,65 +1118,64 @@ end subroutine d2dump
! Lagrangian polynomial interpolation
!------------------------------------------------------------------------------
subroutine polint(xa, ya, x, y, dy, ordn)
subroutine polint(xa,ya,x,y,dy,ordn)
implicit none
integer, intent(in) :: ordn
real*8, dimension(ordn), intent(in) :: xa, ya
!~~~~~~> Input Parameter:
integer,intent(in) :: ordn
real*8, dimension(ordn), intent(in) :: xa,ya
real*8, intent(in) :: x
real*8, intent(out) :: y, dy
real*8, intent(out) :: y,dy
integer :: i, m, ns, n_m
real*8, dimension(ordn) :: c, d, ho
real*8 :: dif, dift, hp, h, den_val
!~~~~~~> Other parameter:
c = ya
d = ya
ho = xa - x
integer :: m,n,ns
real*8, dimension(ordn) :: c,d,den,ho
real*8 :: dif,dift
ns = 1
dif = abs(x - xa(1))
!~~~~~~>
do i = 2, ordn
dift = abs(x - xa(i))
if (dift < dif) then
ns = i
dif = dift
end if
n=ordn
m=ordn
c=ya
d=ya
ho=xa-x
ns=1
dif=abs(x-xa(1))
do m=1,n
dift=abs(x-xa(m))
if(dift < dif) then
ns=m
dif=dift
end if
end do
y = ya(ns)
ns = ns - 1
do m = 1, ordn - 1
n_m = ordn - m
do i = 1, n_m
hp = ho(i)
h = ho(i+m)
den_val = hp - h
if (den_val == 0.0d0) then
write(*,*) 'failure in polint for point',x
write(*,*) 'with input points: ',xa
stop
end if
den_val = (c(i+1) - d(i)) / den_val
d(i) = h * den_val
c(i) = hp * den_val
end do
if (2 * ns < n_m) then
dy = c(ns + 1)
y=ya(ns)
ns=ns-1
do m=1,n-1
den(1:n-m)=ho(1:n-m)-ho(1+m:n)
if (any(den(1:n-m) == 0.0))then
write(*,*) 'failure in polint for point',x
write(*,*) 'with input points: ',xa
stop
endif
den(1:n-m)=(c(2:n-m+1)-d(1:n-m))/den(1:n-m)
d(1:n-m)=ho(1+m:n)*den(1:n-m)
c(1:n-m)=ho(1:n-m)*den(1:n-m)
if (2*ns < n-m) then
dy=c(ns+1)
else
dy = d(ns)
ns = ns - 1
dy=d(ns)
ns=ns-1
end if
y = y + dy
y=y+dy
end do
return
end subroutine polint
!------------------------------------------------------------------------------
!
@@ -1178,37 +1183,35 @@ end subroutine d2dump
!
!------------------------------------------------------------------------------
subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
implicit none
!~~~~~~> Input parameters:
integer,intent(in) :: ordn
real*8, dimension(1:ordn), intent(in) :: x1a,x2a
real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
real*8, intent(in) :: x1,x2
real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER
!~~~~~~> Other parameters:
integer :: i,m
real*8, dimension(ordn) :: ymtmp
real*8, dimension(ordn) :: yntmp
m=size(x1a)
do i=1,m
yntmp=ya(i,:)
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
end do
call polint(x1a,ymtmp,x1,y,dy,ordn)
#else
integer :: j
real*8, dimension(ordn) :: ymtmp
real*8 :: dy_temp
do j=1,ordn
call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
end do
call polint(x2a, ymtmp, x2, y, dy, ordn)
#endif
call polint(x1a,ymtmp,x1,y,dy,ordn)
return
end subroutine polin2
!------------------------------------------------------------------------------
!
@@ -1216,15 +1219,18 @@ end subroutine d2dump
!
!------------------------------------------------------------------------------
subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
implicit none
!~~~~~~> Input parameters:
integer,intent(in) :: ordn
real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
real*8, intent(in) :: x1,x2,x3
real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER
!~~~~~~> Other parameters:
integer :: i,j,m,n
real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(ordn) :: ymtmp
@@ -1233,33 +1239,24 @@ end subroutine d2dump
m=size(x1a)
n=size(x2a)
do i=1,m
do j=1,n
yqtmp=ya(i,j,:)
call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
end do
yntmp=yatmp(i,:)
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
end do
call polint(x1a,ymtmp,x1,y,dy,ordn)
#else
integer :: j, k
real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(ordn) :: ymtmp
real*8 :: dy_temp
do k=1,ordn
do j=1,ordn
call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
end do
end do
do k=1,ordn
call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
end do
call polint(x3a, ymtmp, x3, y, dy, ordn)
#endif
call polint(x1a,ymtmp,x1,y,dy,ordn)
return
end subroutine polin3
!--------------------------------------------------------------------------------------
! calculate L2norm

View File

@@ -215,99 +215,6 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
end subroutine kodis
!-----------------------------------------------------------------------------
! kodis variant: reuses caller-provided fh work array (memory pool)
!-----------------------------------------------------------------------------
subroutine kodis_fh(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps,fh)
implicit none
! argument variables
integer,intent(in) :: Symmetry
integer,dimension(3),intent(in)::ex
real*8, dimension(1:3), intent(in) :: SoA
double precision,intent(in),dimension(ex(1))::X
double precision,intent(in),dimension(ex(2))::Y
double precision,intent(in),dimension(ex(3))::Z
double precision,intent(in),dimension(ex(1),ex(2),ex(3))::f
double precision,intent(inout),dimension(ex(1),ex(2),ex(3))::f_rhs
real*8,intent(in) :: eps
real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)),intent(inout):: fh
! local variables
integer :: imin,jmin,kmin,imax,jmax,kmax
integer :: i,j,k
real*8 :: dX,dY,dZ
real*8, parameter :: ONE=1.d0,SIX=6.d0,FIT=1.5d1,TWT=2.d1
real*8,parameter::cof=6.4d1 ! 2^6
integer, parameter :: NO_SYMM=0, OCTANT=2
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
if(Symmetry == OCTANT .and. dabs(X(1)) < dX) imin = -2
if(Symmetry == OCTANT .and. dabs(Y(1)) < dY) jmin = -2
call symmetry_bd(3,ex,f,fh,SoA)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
if(i-3 >= imin .and. i+3 <= imax .and. &
j-3 >= jmin .and. j+3 <= jmax .and. &
k-3 >= kmin .and. k+3 <= kmax) then
#if 0
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dX/cof * ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dY/cof * ( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dZ/cof * ( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )
#else
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )/dX + &
( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )/dY + &
( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )/dZ )
#endif
endif
enddo
enddo
enddo
return
end subroutine kodis_fh
#elif (ghost_width == 4)
! sixth order code
!------------------------------------------------------------------------------------------------------------------------------

View File

@@ -487,160 +487,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
end subroutine lopsided
!-----------------------------------------------------------------------------
! lopsided variant: reuses caller-provided fh work array (memory pool)
!-----------------------------------------------------------------------------
subroutine lopsided_fh(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,fh)
implicit none
!~~~~~~> Input parameters:
integer, intent(in) :: ex(1:3),Symmetry
real*8, intent(in) :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
real*8,dimension(ex(1),ex(2),ex(3)),intent(in) :: f,Sfx,Sfy,Sfz
real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
real*8,dimension(3),intent(in) ::SoA
real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)),intent(inout):: fh
!~~~~~~> local variables:
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: dX,dY,dZ
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
real*8, parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
real*8, parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
real*8, parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
d12dx = ONE/F12/dX
d12dy = ONE/F12/dY
d12dz = ONE/F12/dZ
d2dx = ONE/TWO/dX
d2dy = ONE/TWO/dY
d2dz = ONE/TWO/dZ
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
call symmetry_bd(3,ex,f,fh,SoA)
! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
#if 0
!! old code - same as original lopsided
#else
!! new code, 2012dec27, based on bam
! x direction
if(Sfx(i,j,k) > ZEO)then
if(i+3 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
-F6*fh(i+2,j,k)+ fh(i+3,j,k))
elseif(i+2 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i+1 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
-F6*fh(i-2,j,k)+ fh(i-3,j,k))
endif
elseif(Sfx(i,j,k) < ZEO)then
if(i-3 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
-F6*fh(i-2,j,k)+ fh(i-3,j,k))
elseif(i-2 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i-1 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
-F6*fh(i+2,j,k)+ fh(i+3,j,k))
endif
endif
! y direction
if(Sfy(i,j,k) > ZEO)then
if(j+3 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
-F6*fh(i,j+2,k)+ fh(i,j+3,k))
elseif(j+2 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j+1 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
-F6*fh(i,j-2,k)+ fh(i,j-3,k))
endif
elseif(Sfy(i,j,k) < ZEO)then
if(j-3 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
-F6*fh(i,j-2,k)+ fh(i,j-3,k))
elseif(j-2 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j-1 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
-F6*fh(i,j+2,k)+ fh(i,j+3,k))
endif
endif
! z direction
if(Sfz(i,j,k) > ZEO)then
if(k+3 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
elseif(k+2 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k+1 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
-F6*fh(i,j,k-2)+ fh(i,j,k-3))
endif
elseif(Sfz(i,j,k) < ZEO)then
if(k-3 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
-F6*fh(i,j,k-2)+ fh(i,j,k-3))
elseif(k-2 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k-1 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
endif
endif
#endif
enddo
enddo
enddo
return
end subroutine lopsided_fh
#elif (ghost_width == 4)
! sixth order code
! Compute advection terms in right hand sides of field equations

View File

@@ -34,7 +34,7 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
prolongrestrict_cell.o prolongrestrict_vertex.o\
rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
rungekutta4_rout.o bssn_rhs_opt.o bssn_rhs.o bssn_rhs_legacy.o diff_new.o kodiss.o kodiss_sh.o\
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\

View File

@@ -7,8 +7,9 @@
filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \
-L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
-lpthread -lm -ldl
## Aggressive optimization flags:
## -O3: Maximum optimization
@@ -16,10 +17,10 @@ LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore
## -fp-model fast=2: Aggressive floating-point optimizations
## -fma: Enable fused multiply-add instructions
## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma \
-fpp -I${MKLROOT}/include
f90 = ifx
f77 = ifx
CXX = icpx

View File

@@ -15,13 +15,12 @@ import subprocess
## taskset ensures all child processes inherit the CPU affinity mask
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
#NUMACTL_CPU_BIND = "taskset -c 0-111"
NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 64
BUILD_JOBS = 104
##################################################################