Fix potential division by zero in reta_val calculation and enable NaN checks

Added a safety check for the denominator in the reta_val calculation to prevent division by zero when chi approaches zero (e.g., at far-field boundaries). Also enabled DEBUG_NAN_CHECK macro to catch invalid inputs early. Initialized output arrays to zero to prevent uninitialized memory access.
Fix boundary handling in bssn_rhs_opt.f90 to prevent NaNs
2026-01-19 20:29:48 +08:00 · 2026-01-19 20:03:22 +08:00 · 2026-01-19 19:22:52 +08:00 · 2026-01-19 17:14:28 +08:00 · 2026-01-19 16:39:24 +08:00
14 changed files with 2716 additions and 2044 deletions
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,7 +16,7 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 64                             ## number of mpi processes used in the simulation
+MPI_processes    = 48                             ## number of mpi processes used in the simulation
 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
@@ -5,7 +5,6 @@
 #include <cstdio>
 #include <cstdlib>
 #include <string>
 #include <cstring>
 #include <iostream>
 #include <iomanip>
 #include <fstream>
@@ -61,110 +60,13 @@ TwoPunctures::TwoPunctures(double mp, double mm, double b,
  F = dvector(0, ntotal - 1);
  allocate_derivs(&u, ntotal);
  allocate_derivs(&v, ntotal);
  // Allocate workspace buffers for hot-path allocation elimination
  int N = maximum3(n1, n2, n3);
  int maxn = maximum2(n1, n2);
  // LineRelax_be workspace (sized for n2)
  ws_diag_be = new double[n2];
  ws_e_be = new double[n2 - 1];
  ws_f_be = new double[n2 - 1];
  ws_b_be = new double[n2];
  ws_x_be = new double[n2];
  // LineRelax_al workspace (sized for n1)
  ws_diag_al = new double[n1];
  ws_e_al = new double[n1 - 1];
  ws_f_al = new double[n1 - 1];
  ws_b_al = new double[n1];
  ws_x_al = new double[n1];
  // ThomasAlgorithm workspace (sized for max(n1,n2))
  ws_thomas_y = new double[maxn];
  // JFD_times_dv workspace (sized for nvar)
  ws_jfd_values = dvector(0, nvar - 1);
  allocate_derivs(&ws_jfd_dU, nvar);
  allocate_derivs(&ws_jfd_U, nvar);
  // chebft_Zeros workspace (sized for N+1)
  ws_cheb_c = dvector(0, N);
  // fourft workspace (sized for N/2+1 each)
  ws_four_a = dvector(0, N / 2);
  ws_four_b = dvector(0, N / 2);
  // Derivatives_AB3 workspace
  ws_deriv_p = dvector(0, N);
  ws_deriv_dp = dvector(0, N);
  ws_deriv_d2p = dvector(0, N);
  ws_deriv_q = dvector(0, N);
  ws_deriv_dq = dvector(0, N);
  ws_deriv_r = dvector(0, N);
  ws_deriv_dr = dvector(0, N);
  ws_deriv_indx = ivector(0, N);
  // F_of_v workspace
  ws_fov_sources = new double[n1 * n2 * n3];
  ws_fov_values = dvector(0, nvar - 1);
  allocate_derivs(&ws_fov_U, nvar);
  // J_times_dv workspace
  ws_jtdv_values = dvector(0, nvar - 1);
  allocate_derivs(&ws_jtdv_dU, nvar);
  allocate_derivs(&ws_jtdv_U, nvar);
 }
 // Destructor: releases the solver arrays (F, u, v) and every pre-allocated
 // workspace buffer (ws_*) that the constructor set up.
 // Each buffer is released with the counterpart of the call that created it:
 //   new double[...]      -> delete[]
 //   dvector / ivector    -> free_dvector / free_ivector (same index range)
 //   allocate_derivs      -> free_derivs
 TwoPunctures::~TwoPunctures()
 {
  // NOTE(review): nvar is hard-coded to 1 here and n1/n2/n3 are re-derived from
  // npoints_A/B/phi; presumably these match the values used in the constructor
  // when the buffers were sized -- confirm.
  int const nvar = 1, n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi;
  // Largest of the three grid sizes; the spectral/derivative workspaces below
  // were allocated with index range 0..N (or 0..N/2 for the Fourier buffers).
  int N = maximum3(n1, n2, n3);
  free_dvector(F, 0, ntotal - 1);
  free_derivs(&u, ntotal);
  free_derivs(&v, ntotal);
  // Free workspace buffers
  // LineRelax_be workspace (allocated with new[])
  delete[] ws_diag_be;
  delete[] ws_e_be;
  delete[] ws_f_be;
  delete[] ws_b_be;
  delete[] ws_x_be;
  // LineRelax_al workspace (allocated with new[])
  delete[] ws_diag_al;
  delete[] ws_e_al;
  delete[] ws_f_al;
  delete[] ws_b_al;
  delete[] ws_x_al;
  // ThomasAlgorithm workspace (allocated with new[])
  delete[] ws_thomas_y;
  // JFD_times_dv workspace
  free_dvector(ws_jfd_values, 0, nvar - 1);
  free_derivs(&ws_jfd_dU, nvar);
  free_derivs(&ws_jfd_U, nvar);
  // chebft_Zeros workspace
  free_dvector(ws_cheb_c, 0, N);
  // fourft workspace
  free_dvector(ws_four_a, 0, N / 2);
  free_dvector(ws_four_b, 0, N / 2);
  // Derivatives_AB3 workspace
  free_dvector(ws_deriv_p, 0, N);
  free_dvector(ws_deriv_dp, 0, N);
  free_dvector(ws_deriv_d2p, 0, N);
  free_dvector(ws_deriv_q, 0, N);
  free_dvector(ws_deriv_dq, 0, N);
  free_dvector(ws_deriv_r, 0, N);
  free_dvector(ws_deriv_dr, 0, N);
  free_ivector(ws_deriv_indx, 0, N);
  // F_of_v workspace (sources came from new[], the rest from dvector/allocate_derivs)
  delete[] ws_fov_sources;
  free_dvector(ws_fov_values, 0, nvar - 1);
  free_derivs(&ws_fov_U, nvar);
  // J_times_dv workspace
  free_dvector(ws_jtdv_values, 0, nvar - 1);
  free_derivs(&ws_jtdv_dU, nvar);
  free_derivs(&ws_jtdv_U, nvar);
 }
 void TwoPunctures::Solve()
@@ -753,7 +655,7 @@ void TwoPunctures::chebft_Zeros(double u[], int n, int inv)
  int k, j, isignum;
  double fac, sum, Pion, *c;
-  c = ws_cheb_c;
+  c = dvector(0, n);
  Pion = Pi / n;
  if (inv == 0)
  {
@@ -784,6 +686,7 @@ void TwoPunctures::chebft_Zeros(double u[], int n, int inv)
  }
  for (j = 0; j < n; j++)
    u[j] = c[j];
  free_dvector(c, 0, n);
 }
 /* --------------------------------------------------------------------------*/
@@ -871,8 +774,8 @@ void TwoPunctures::fourft(double *u, int N, int inv)
  double x, x1, fac, Pi_fac, *a, *b;
  M = N / 2;
-  a = ws_four_a;
+  a = dvector(0, M);
-  b = ws_four_b - 1; /* offset to match dvector(1,M) indexing */
+  b = dvector(1, M); /* Actually: b=vector(1,M-1) but this is problematic if M=1*/
  fac = 1. / M;
  Pi_fac = Pi * fac;
  if (inv == 0)
@@ -921,6 +824,8 @@ void TwoPunctures::fourft(double *u, int N, int inv)
      iy = -iy;
    }
  }
  free_dvector(a, 0, M);
  free_dvector(b, 1, M);
 }
 /* -----------------------------------------*/
@@ -1213,14 +1118,14 @@ void TwoPunctures::Derivatives_AB3(int nvar, int n1, int n2, int n3, derivs v)
  double *p, *dp, *d2p, *q, *dq, *r, *dr;
  N = maximum3(n1, n2, n3);
-  p = ws_deriv_p;
+  p = dvector(0, N);
-  dp = ws_deriv_dp;
+  dp = dvector(0, N);
-  d2p = ws_deriv_d2p;
+  d2p = dvector(0, N);
-  q = ws_deriv_q;
+  q = dvector(0, N);
-  dq = ws_deriv_dq;
+  dq = dvector(0, N);
-  r = ws_deriv_r;
+  r = dvector(0, N);
-  dr = ws_deriv_dr;
+  dr = dvector(0, N);
-  indx = ws_deriv_indx;
+  indx = ivector(0, N);
  for (ivar = 0; ivar < nvar; ivar++)
  {
@@ -1303,6 +1208,14 @@ void TwoPunctures::Derivatives_AB3(int nvar, int n1, int n2, int n3, derivs v)
      }
    }
  }
  free_dvector(p, 0, N);
  free_dvector(dp, 0, N);
  free_dvector(d2p, 0, N);
  free_dvector(q, 0, N);
  free_dvector(dq, 0, N);
  free_dvector(r, 0, N);
  free_dvector(dr, 0, N);
  free_ivector(indx, 0, N);
 }
 /* --------------------------------------------------------------------------*/
 void TwoPunctures::Newton(int const nvar, int const n1, int const n2, int const n3,
@@ -1371,11 +1284,10 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
  derivs U;
  double *sources;
-  values = ws_fov_values;
+  values = dvector(0, nvar - 1);
-  U = ws_fov_U;
+  allocate_derivs(&U, nvar);
-  sources = ws_fov_sources;
+  sources = (double *)calloc(n1 * n2 * n3, sizeof(double));
  memset(sources, 0, n1 * n2 * n3 * sizeof(double));
  if (0)
  {
    double *s_x, *s_y, *s_z;
@@ -1530,6 +1442,9 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
  {
    fclose(debugfile);
  }
  free(sources);
  free_dvector(values, 0, nvar - 1);
  free_derivs(&U, nvar);
 }
 /* --------------------------------------------------------------------------*/
 double TwoPunctures::norm_inf(double const *F, int const ntotal)
@@ -1935,12 +1850,11 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl
  Derivatives_AB3(nvar, n1, n2, n3, dv);
  values = ws_jtdv_values;
  dU = ws_jtdv_dU;
  U = ws_jtdv_U;
  for (i = 0; i < n1; i++)
  {
    values = dvector(0, nvar - 1);
    allocate_derivs(&dU, nvar);
    allocate_derivs(&U, nvar);
    for (j = 0; j < n2; j++)
    {
      for (k = 0; k < n3; k++)
@@ -1994,6 +1908,9 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl
        }
      }
    }
    free_dvector(values, 0, nvar - 1);
    free_derivs(&dU, nvar);
    free_derivs(&U, nvar);
  }
 }
 /* --------------------------------------------------------------------------*/
@@ -2040,11 +1957,17 @@ void TwoPunctures::LineRelax_be(double *dv,
 {
  int j, m, Ic, Ip, Im, col, ivar;
-  double *diag = ws_diag_be;
+  double *diag = new double[n2];
-  double *e = ws_e_be;     /* above diagonal */
+  double *e = new double[n2 - 1]; /* above diagonal */
-  double *f = ws_f_be;     /* below diagonal */
+  double *f = new double[n2 - 1]; /* below diagonal */
-  double *b = ws_b_be;     /* rhs */
+  double *b = new double[n2];     /* rhs */
-  double *x = ws_x_be;     /* solution vector */
+  double *x = new double[n2];     /* solution vector */
  //  gsl_vector *diag = gsl_vector_alloc(n2);
  //  gsl_vector *e = gsl_vector_alloc(n2-1); /* above diagonal */
  //  gsl_vector *f = gsl_vector_alloc(n2-1); /* below diagonal */
  //  gsl_vector *b = gsl_vector_alloc(n2);   /* rhs */
  //  gsl_vector *x = gsl_vector_alloc(n2);   /* solution vector */
  for (ivar = 0; ivar < nvar; ivar++)
  {
@@ -2054,35 +1977,62 @@ void TwoPunctures::LineRelax_be(double *dv,
    }
    diag[n2 - 1] = 0;
    //    gsl_vector_set_zero(diag);
    //    gsl_vector_set_zero(e);
    //    gsl_vector_set_zero(f);
    for (j = 0; j < n2; j++)
    {
      Ip = Index(ivar, i, j + 1, k, nvar, n1, n2, n3);
      Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
      Im = Index(ivar, i, j - 1, k, nvar, n1, n2, n3);
      b[j] = rhs[Ic];
      //      gsl_vector_set(b,j,rhs[Ic]);
      for (m = 0; m < ncols[Ic]; m++)
      {
        col = cols[Ic][m];
        if (col != Ip && col != Ic && col != Im)
          b[j] -= JFD[Ic][m] * dv[col];
        //          *gsl_vector_ptr(b, j) -= JFD[Ic][m] * dv[col];
        else
        {
          if (col == Im && j > 0)
            f[j - 1] = JFD[Ic][m];
          //            gsl_vector_set(f,j-1,JFD[Ic][m]);
          if (col == Ic)
            diag[j] = JFD[Ic][m];
          //            gsl_vector_set(diag,j,JFD[Ic][m]);
          if (col == Ip && j < n2 - 1)
            e[j] = JFD[Ic][m];
          //            gsl_vector_set(e,j,JFD[Ic][m]);
        }
      }
    }
    //          A x = b
    //          A = ( d_0 e_0  0   0  )
    //              ( f_0 d_1 e_1  0  )
    //              (  0  f_1 d_2 e_2 )
    //              (  0   0  f_2 d_3 )
    //
    ThomasAlgorithm(n2, f, diag, e, x, b);
    //    gsl_linalg_solve_tridiag(diag, e, f, b, x);
    for (j = 0; j < n2; j++)
    {
      Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
      dv[Ic] = x[j];
      //      dv[Ic] = gsl_vector_get(x, j);
    }
  }
  delete[] diag;
  delete[] e;
  delete[] f;
  delete[] b;
  delete[] x;
  //  gsl_vector_free(diag);
  //  gsl_vector_free(e);
  //  gsl_vector_free(f);
  //  gsl_vector_free(b);
  //  gsl_vector_free(x);
 }
 /* --------------------------------------------------------------------------*/
 void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
@@ -2099,8 +2049,8 @@ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
      ha, ga, ga2, hb, gb, gb2, hp, gp, gp2, gagb, gagp, gbgp;
  derivs dU, U;
-  dU = ws_jfd_dU;
+  allocate_derivs(&dU, nvar);
-  U = ws_jfd_U;
+  allocate_derivs(&U, nvar);
  if (k < 0)
    k = k + n3;
@@ -2218,6 +2168,9 @@ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
  LinEquations(A, B, X, R, x, r, phi, y, z, dU, U, values);
  for (ivar = 0; ivar < nvar; ivar++)
    values[ivar] *= FAC;
  free_derivs(&dU, nvar);
  free_derivs(&U, nvar);
 }
 #undef FAC
 /*-----------------------------------------------------------*/
@@ -2249,11 +2202,17 @@ void TwoPunctures::LineRelax_al(double *dv,
 {
  int i, m, Ic, Ip, Im, col, ivar;
-  double *diag = ws_diag_al;
+  double *diag = new double[n1];
-  double *e = ws_e_al;     /* above diagonal */
+  double *e = new double[n1 - 1]; /* above diagonal */
-  double *f = ws_f_al;     /* below diagonal */
+  double *f = new double[n1 - 1]; /* below diagonal */
-  double *b = ws_b_al;     /* rhs */
+  double *b = new double[n1];     /* rhs */
-  double *x = ws_x_al;     /* solution vector */
+  double *x = new double[n1];     /* solution vector */
  //  gsl_vector *diag = gsl_vector_alloc(n1);
  //  gsl_vector *e = gsl_vector_alloc(n1-1); /* above diagonal */
  //  gsl_vector *f = gsl_vector_alloc(n1-1); /* below diagonal */
  //  gsl_vector *b = gsl_vector_alloc(n1);   /* rhs */
  //  gsl_vector *x = gsl_vector_alloc(n1);   /* solution vector */
  for (ivar = 0; ivar < nvar; ivar++)
  {
@@ -2263,35 +2222,57 @@ void TwoPunctures::LineRelax_al(double *dv,
    }
    diag[n1 - 1] = 0;
    //    gsl_vector_set_zero(diag);
    //    gsl_vector_set_zero(e);
    //    gsl_vector_set_zero(f);
    for (i = 0; i < n1; i++)
    {
      Ip = Index(ivar, i + 1, j, k, nvar, n1, n2, n3);
      Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
      Im = Index(ivar, i - 1, j, k, nvar, n1, n2, n3);
      b[i] = rhs[Ic];
      //      gsl_vector_set(b,i,rhs[Ic]);
      for (m = 0; m < ncols[Ic]; m++)
      {
        col = cols[Ic][m];
        if (col != Ip && col != Ic && col != Im)
          b[i] -= JFD[Ic][m] * dv[col];
        //          *gsl_vector_ptr(b, i) -= JFD[Ic][m] * dv[col];
        else
        {
          if (col == Im && i > 0)
            f[i - 1] = JFD[Ic][m];
          //            gsl_vector_set(f,i-1,JFD[Ic][m]);
          if (col == Ic)
            diag[i] = JFD[Ic][m];
          //            gsl_vector_set(diag,i,JFD[Ic][m]);
          if (col == Ip && i < n1 - 1)
            e[i] = JFD[Ic][m];
          //            gsl_vector_set(e,i,JFD[Ic][m]);
        }
      }
    }
    ThomasAlgorithm(n1, f, diag, e, x, b);
    //    gsl_linalg_solve_tridiag(diag, e, f, b, x);
    for (i = 0; i < n1; i++)
    {
      Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
      dv[Ic] = x[i];
      //      dv[Ic] = gsl_vector_get(x, i);
    }
  }
  delete[] diag;
  delete[] e;
  delete[] f;
  delete[] b;
  delete[] x;
  //  gsl_vector_free(diag);
  //  gsl_vector_free(e);
  //  gsl_vector_free(f);
  //  gsl_vector_free(b);
  //  gsl_vector_free(x);
 }
 /* -------------------------------------------------------------------------*/
 // a[N], b[N-1], c[N-1], x[N], q[N]
@@ -2303,29 +2284,44 @@ void TwoPunctures::LineRelax_al(double *dv,
 //"Parallel Scientific Computing in C++ and MPI" P361
 void TwoPunctures::ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q)
 {
  // In-place Thomas algorithm: uses a[] as d workspace, b[] as l workspace.
  // c[] is already u (above-diagonal). ws_thomas_y is pre-allocated workspace.
  int i;
-  double *y = ws_thomas_y;
+  double *l, *u, *d, *y;
  l = new double[N - 1];
  u = new double[N - 1];
  d = new double[N];
  y = new double[N];
  /* LU Decomposition */
  d[0] = a[0];
  u[0] = c[0];
  /* LU Decomposition (in-place: a becomes d, b becomes l) */
  for (i = 0; i < N - 2; i++)
  {
-    b[i] = b[i] / a[i];
+    l[i] = b[i] / d[i];
-    a[i + 1] = a[i + 1] - b[i] * c[i];
+    d[i + 1] = a[i + 1] - l[i] * u[i];
    u[i + 1] = c[i + 1];
  }
-  b[N - 2] = b[N - 2] / a[N - 2];
+
-  a[N - 1] = a[N - 1] - b[N - 2] * c[N - 2];
+  l[N - 2] = b[N - 2] / d[N - 2];
  d[N - 1] = a[N - 1] - l[N - 2] * u[N - 2];
  /* Forward Substitution [L][y] = [q] */
  y[0] = q[0];
  for (i = 1; i < N; i++)
-    y[i] = q[i] - b[i - 1] * y[i - 1];
+    y[i] = q[i] - l[i - 1] * y[i - 1];
  /* Backward Substitution [U][x] = [y] */
-  x[N - 1] = y[N - 1] / a[N - 1];
+  x[N - 1] = y[N - 1] / d[N - 1];
  for (i = N - 2; i >= 0; i--)
-    x[i] = (y[i] - c[i] * x[i + 1]) / a[i];
+    x[i] = (y[i] - u[i] * x[i + 1]) / d[i];
  delete[] l;
  delete[] u;
  delete[] d;
  delete[] y;
  return;
 }
 // --------------------------------------------------------------------------*/
 // Calculates the value of v at an arbitrary position (x,y,z) if the spectral coefficients are known
--- a/AMSS_NCKU_source/TwoPunctures.h
+++ b/AMSS_NCKU_source/TwoPunctures.h
@@ -42,33 +42,6 @@ private:
       int ntotal;
       // Pre-allocated workspace buffers for hot-path allocation elimination
       // LineRelax_be workspace (sized for n2)
       double *ws_diag_be, *ws_e_be, *ws_f_be, *ws_b_be, *ws_x_be;
       // LineRelax_al workspace (sized for n1)
       double *ws_diag_al, *ws_e_al, *ws_f_al, *ws_b_al, *ws_x_al;
       // ThomasAlgorithm workspace (sized for max(n1,n2))
       double *ws_thomas_y;
       // JFD_times_dv workspace (sized for nvar)
       double *ws_jfd_values;
       derivs ws_jfd_dU, ws_jfd_U;
       // chebft_Zeros workspace (sized for max(n1,n2,n3)+1)
       double *ws_cheb_c;
       // fourft workspace (sized for max(n1,n2,n3)/2+1 each)
       double *ws_four_a, *ws_four_b;
       // Derivatives_AB3 workspace
       double *ws_deriv_p, *ws_deriv_dp, *ws_deriv_d2p;
       double *ws_deriv_q, *ws_deriv_dq;
       double *ws_deriv_r, *ws_deriv_dr;
       int *ws_deriv_indx;
       // F_of_v workspace
       double *ws_fov_sources;
       double *ws_fov_values;
       derivs ws_fov_U;
       // J_times_dv workspace
       double *ws_jtdv_values;
       derivs ws_jtdv_dU, ws_jtdv_U;
       struct parameters
       {
              int nvar, n1, n2, n3;
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
--- a/AMSS_NCKU_source/bssn_rhs_legacy.f90
+++ b/AMSS_NCKU_source/bssn_rhs_legacy.f90
--- a/AMSS_NCKU_source/bssn_rhs_opt.f90
+++ b/AMSS_NCKU_source/bssn_rhs_opt.f90
--- a/AMSS_NCKU_source/diff_new.f90
+++ b/AMSS_NCKU_source/diff_new.f90
@@ -1103,103 +1103,6 @@
  end subroutine fderivs
 !-----------------------------------------------------------------------------
 ! fderivs variant: reuses caller-provided fh work array (memory pool)
 !-----------------------------------------------------------------------------
  subroutine fderivs_fh(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3, &
                         symmetry,onoff,fh)
 ! Purpose: first derivatives fx,fy,fz of the grid function f by centered
 !          finite differences, using the caller-supplied work array fh
 !          instead of a locally declared one.
 ! Arguments (as used in this routine):
 !   ex        - grid extents in the three directions
 !   f         - input field
 !   fx,fy,fz  - outputs; set to zero first, then overwritten wherever a
 !               stencil fits (points where none fits keep the value zero)
 !   X,Y,Z     - coordinate arrays; only entries 1 and 2 are read, to form
 !               the spacing and to test closeness to the origin
 !               (assumes uniform spacing -- TODO confirm)
 !   SYM1..3   - copied into SoA and handed to symmetry_bd
 !               (presumably per-direction parity factors -- confirm)
 !   symmetry  - compared with NO_SYMM/EQ_SYMM to decide whether the low-side
 !               ghost indices of fh may be used by the stencils
 !   onoff     - not referenced in this routine
 !   fh        - work array with two extra low-side layers (indices -1,0);
 !               passed to symmetry_bd, which presumably fills it from f
 !               -- confirm against symmetry_bd
  implicit none
  integer,                               intent(in ):: ex(1:3),symmetry,onoff
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(in ):: f
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: fx,fy,fz
  real*8,                                intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
  real*8,                                intent(in ):: SYM1,SYM2,SYM3
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)),intent(inout):: fh
  real*8 :: dX,dY,dZ
  real*8, dimension(3) :: SoA
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0
  real*8,  parameter :: TWO=2.d0,EIT=8.d0
  real*8,  parameter :: F12=1.2d1
 ! grid spacing taken from the first two coordinate values
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
 ! index range a stencil is allowed to touch; upper bound is the grid extent
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
 ! when a symmetry is active and the first grid point lies within one spacing
 ! of the origin, the lower bound is extended to the ghost layers -1,0 of fh
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
  SoA(1) = SYM1
  SoA(2) = SYM2
  SoA(3) = SYM3
  call symmetry_bd(2,ex,f,fh,SoA)
 ! prefactors: 1/(12 h) for the 5-point stencil, 1/(2 h) for the 3-point stencil
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
  fx = ZEO
  fy = ZEO
  fz = ZEO
 ! the loops stop at ex-1, so the last index plane in each direction is never
 ! visited and keeps the zero assigned above
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! disabled variant: picks the stencil width independently per direction
 #if 0
   if(i+2 <= imax .and. i-2 >= imin)then
      fx(i,j,k)=d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
    elseif(i+1 <= imax .and. i-1 >= imin)then
      fx(i,j,k)=d2dx*(-fh(i-1,j,k)+fh(i+1,j,k))
    endif
        if(j+2 <= jmax .and. j-2 >= jmin)then
      fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
    elseif(j+1 <= jmax .and. j-1 >= jmin)then
     fy(i,j,k)=d2dy*(-fh(i,j-1,k)+fh(i,j+1,k))
    endif
        if(k+2 <= kmax .and. k-2 >= kmin)then
      fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
    elseif(k+1 <= kmax .and. k-1 >= kmin)then
      fz(i,j,k)=d2dz*(-fh(i,j,k-1)+fh(i,j,k+1))
    endif
 #else
 ! active variant: the 5-point stencil is used only when it fits in all three
 ! directions at once; otherwise the 3-point stencil is used if that fits in
 ! all three directions; otherwise the point is left at zero
   if(i+2 <= imax .and. i-2 >= imin .and. &
      j+2 <= jmax .and. j-2 >= jmin .and. &
      k+2 <= kmax .and. k-2 >= kmin) then
      fx(i,j,k)=d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
      fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
      fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
   elseif(i+1 <= imax .and. i-1 >= imin .and. &
          j+1 <= jmax .and. j-1 >= jmin .and. &
          k+1 <= kmax .and. k-1 >= kmin) then
      fx(i,j,k)=d2dx*(-fh(i-1,j,k)+fh(i+1,j,k))
      fy(i,j,k)=d2dy*(-fh(i,j-1,k)+fh(i,j+1,k))
      fz(i,j,k)=d2dz*(-fh(i,j,k-1)+fh(i,j,k+1))
   endif
 #endif
  enddo
  enddo
  enddo
  return
  end subroutine fderivs_fh
 !-----------------------------------------------------------------------------
 !
 ! single derivatives dx
 !
@@ -2037,162 +1940,6 @@
  end subroutine fddyz
 !-----------------------------------------------------------------------------
 ! fdderivs variant: reuses caller-provided fh work array (memory pool)
 !-----------------------------------------------------------------------------
  subroutine fdderivs_fh(ex,f,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
                          SYM1,SYM2,SYM3,symmetry,onoff,fh)
 ! Purpose: all six second derivatives of the grid function f by centered
 !          finite differences, using the caller-supplied work array fh
 !          instead of a locally declared one.
 ! Arguments (as used in this routine):
 !   ex        - grid extents in the three directions
 !   f         - input field
 !   fxx..fzz  - outputs; set to zero first, then overwritten wherever a
 !               stencil fits (points where none fits keep the value zero)
 !   X,Y,Z     - coordinate arrays; only entries 1 and 2 are read, to form
 !               the spacing and to test closeness to the origin
 !               (assumes uniform spacing -- TODO confirm)
 !   SYM1..3   - copied into SoA and handed to symmetry_bd
 !               (presumably per-direction parity factors -- confirm)
 !   symmetry  - compared with NO_SYMM/EQ_SYMM to decide whether the low-side
 !               ghost indices of fh may be used by the stencils
 !   onoff     - not referenced in this routine
 !   fh        - work array with two extra low-side layers (indices -1,0);
 !               passed to symmetry_bd, which presumably fills it from f
 !               -- confirm against symmetry_bd
  implicit none
  integer,                             intent(in ):: ex(1:3),symmetry,onoff
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in ):: f
  real*8, dimension(ex(1),ex(2),ex(3)),intent(out):: fxx,fxy,fxz,fyy,fyz,fzz
  real*8,                              intent(in ):: X(ex(1)),Y(ex(2)),Z(ex(3))
  real*8,                              intent(in ):: SYM1,SYM2,SYM3
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)),intent(inout):: fh
  real*8 :: dX,dY,dZ
  real*8, dimension(3) :: SoA
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8  :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
  real*8  :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8, parameter :: ZEO=0.d0, ONE=1.d0, TWO=2.d0, F1o4=2.5d-1
  real*8, parameter :: F8=8.d0, F16=1.6d1, F30=3.d1
  real*8, parameter :: F1o12=ONE/1.2d1, F1o144=ONE/1.44d2
 ! grid spacing taken from the first two coordinate values
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
 ! index range a stencil is allowed to touch; upper bound is the grid extent
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
 ! when a symmetry is active and the first grid point lies within one spacing
 ! of the origin, the lower bound is extended to the ghost layers -1,0 of fh
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
  SoA(1) = SYM1
  SoA(2) = SYM2
  SoA(3) = SYM3
  call symmetry_bd(2,ex,f,fh,SoA)
 ! prefactors; the S* set belongs to the 3-point stencils, the F* set to the
 ! 5-point stencils:
 !   Sdxdx = 1/h^2         Fdxdx = 1/(12 h^2)      (pure second derivatives)
 !   Sdxdy = 1/(4 hx hy)   Fdxdy = 1/(144 hx hy)   (mixed derivatives)
  Sdxdx =  ONE /( dX * dX )
  Sdydy =  ONE /( dY * dY )
  Sdzdz =  ONE /( dZ * dZ )
  Fdxdx = F1o12 /( dX * dX )
  Fdydy = F1o12 /( dY * dY )
  Fdzdz = F1o12 /( dZ * dZ )
  Sdxdy = F1o4 /( dX * dY )
  Sdxdz = F1o4 /( dX * dZ )
  Sdydz = F1o4 /( dY * dZ )
  Fdxdy = F1o144 /( dX * dY )
  Fdxdz = F1o144 /( dX * dZ )
  Fdydz = F1o144 /( dY * dZ )
  fxx = ZEO
  fyy = ZEO
  fzz = ZEO
  fxy = ZEO
  fxz = ZEO
  fyz = ZEO
 ! the loops stop at ex-1, so the last index plane in each direction is never
 ! visited and keeps the zero assigned above
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! disabled variant: picks the stencil width independently per derivative,
 ! looking only at the direction(s) that derivative involves
 #if 0
   if(i+2 <= imax .and. i-2 >= imin)then
   fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
                       -fh(i+2,j,k)+F16*fh(i+1,j,k)              )
   elseif(i+1 <= imax .and. i-1 >= imin)then
   fxx(i,j,k) = Sdxdx*(fh(i-1,j,k)-TWO*fh(i,j,k)+fh(i+1,j,k))
   endif
        if(j+2 <= jmax .and. j-2 >= jmin)then
   fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
                       -fh(i,j+2,k)+F16*fh(i,j+1,k)              )
   elseif(j+1 <= jmax .and. j-1 >= jmin)then
   fyy(i,j,k) = Sdydy*(fh(i,j-1,k)-TWO*fh(i,j,k)+fh(i,j+1,k))
   endif
        if(k+2 <= kmax .and. k-2 >= kmin)then
   fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
                       -fh(i,j,k+2)+F16*fh(i,j,k+1)              )
   elseif(k+1 <= kmax .and. k-1 >= kmin)then
   fzz(i,j,k) = Sdzdz*(fh(i,j,k-1)-TWO*fh(i,j,k)+fh(i,j,k+1))
   endif
       if(i+2 <= imax .and. i-2 >= imin .and. j+2 <= jmax .and. j-2 >= jmin)then
   fxy(i,j,k) = Fdxdy*(     (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k))  &
                       -F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k))  &
                       +F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k))  &
                       -    (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
   elseif(i+1 <= imax .and. i-1 >= imin .and. j+1 <= jmax .and. j-1 >= jmin)then
   fxy(i,j,k) = Sdxdy*(fh(i-1,j-1,k)-fh(i+1,j-1,k)-fh(i-1,j+1,k)+fh(i+1,j+1,k))
   endif
       if(i+2 <= imax .and. i-2 >= imin .and. k+2 <= kmax .and. k-2 >= kmin)then
   fxz(i,j,k) = Fdxdz*(     (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2))  &
                       -F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1))  &
                       +F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1))  &
                       -    (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
   elseif(i+1 <= imax .and. i-1 >= imin .and. k+1 <= kmax .and. k-1 >= kmin)then
   fxz(i,j,k) = Sdxdz*(fh(i-1,j,k-1)-fh(i+1,j,k-1)-fh(i-1,j,k+1)+fh(i+1,j,k+1))
   endif
       if(j+2 <= jmax .and. j-2 >= jmin .and. k+2 <= kmax .and. k-2 >= kmin)then
   fyz(i,j,k) = Fdydz*(     (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2))  &
                       -F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1))  &
                       +F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1))  &
                       -    (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
   elseif(j+1 <= jmax .and. j-1 >= jmin .and. k+1 <= kmax .and. k-1 >= kmin)then
   fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1))
   endif
 #else
 ! for bam comparison
 ! active variant: the 5-point stencils are used for all six derivatives only
 ! when they fit in all three directions at once; otherwise the 3-point
 ! stencils are used if those fit in all three directions; otherwise every
 ! output at this point is left at zero
   if(i+2 <= imax .and. i-2 >= imin .and. &
      j+2 <= jmax .and. j-2 >= jmin .and. &
      k+2 <= kmax .and. k-2 >= kmin) then
   fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
                       -fh(i+2,j,k)+F16*fh(i+1,j,k)              )
   fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
                       -fh(i,j+2,k)+F16*fh(i,j+1,k)              )
   fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
                       -fh(i,j,k+2)+F16*fh(i,j,k+1)              )
   fxy(i,j,k) = Fdxdy*(     (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k))  &
                       -F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k))  &
                       +F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k))  &
                       -    (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
   fxz(i,j,k) = Fdxdz*(     (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2))  &
                       -F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1))  &
                       +F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1))  &
                       -    (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
   fyz(i,j,k) = Fdydz*(     (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2))  &
                       -F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1))  &
                       +F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1))  &
                       -    (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
   elseif(i+1 <= imax .and. i-1 >= imin .and. &
          j+1 <= jmax .and. j-1 >= jmin .and. &
          k+1 <= kmax .and. k-1 >= kmin) then
   fxx(i,j,k) = Sdxdx*(fh(i-1,j,k)-TWO*fh(i,j,k)+fh(i+1,j,k))
   fyy(i,j,k) = Sdydy*(fh(i,j-1,k)-TWO*fh(i,j,k)+fh(i,j+1,k))
   fzz(i,j,k) = Sdzdz*(fh(i,j,k-1)-TWO*fh(i,j,k)+fh(i,j,k+1))
   fxy(i,j,k) = Sdxdy*(fh(i-1,j-1,k)-fh(i+1,j-1,k)-fh(i-1,j+1,k)+fh(i+1,j+1,k))
   fxz(i,j,k) = Sdxdz*(fh(i-1,j,k-1)-fh(i+1,j,k-1)-fh(i-1,j,k+1)+fh(i+1,j,k+1))
   fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1))
   endif
 #endif
   enddo
   enddo
   enddo
  return
  end subroutine fdderivs_fh
 #elif (ghost_width == 4)
 ! sixth order code
--- a/AMSS_NCKU_source/enforce_algebra.f90
+++ b/AMSS_NCKU_source/enforce_algebra.f90
@@ -19,60 +19,48 @@
 !~~~~~~~> Local variable:
-  integer :: i,j,k
+  real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg
-  real*8 :: lgxx,lgyy,lgzz,ldetg
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz 
-  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
  real*8 :: ltrA,lscale
  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
 !~~~~~~>
-  do k=1,ex(3)
+  gxx = dxx + ONE
-  do j=1,ex(2)
+  gyy = dyy + ONE
-  do i=1,ex(1)
+  gzz = dzz + ONE
-    lgxx = dxx(i,j,k) + ONE
+  detg =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
-    lgyy = dyy(i,j,k) + ONE
+          gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
-    lgzz = dzz(i,j,k) + ONE
+  gupxx =   ( gyy * gzz - gyz * gyz ) / detg
  gupxy = - ( gxy * gzz - gyz * gxz ) / detg
  gupxz =   ( gxy * gyz - gyy * gxz ) / detg
  gupyy =   ( gxx * gzz - gxz * gxz ) / detg
  gupyz = - ( gxx * gyz - gxy * gxz ) / detg
  gupzz =   ( gxx * gyy - gxy * gxy ) / detg
-    ldetg =  lgxx * lgyy * lgzz &
+  trA =         gupxx * Axx + gupyy * Ayy + gupzz * Azz &
-           + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) &
+       + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
           + gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
           - gxz(i,j,k) * lgyy * gxz(i,j,k) &
           - gxy(i,j,k) * gxy(i,j,k) * lgzz &
           - lgxx * gyz(i,j,k) * gyz(i,j,k)
-    lgupxx =   ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg
+  Axx = Axx - F1o3 * gxx * trA
-    lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg
+  Axy = Axy - F1o3 * gxy * trA
-    lgupxz =   ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg
+  Axz = Axz - F1o3 * gxz * trA
-    lgupyy =   ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg
+  Ayy = Ayy - F1o3 * gyy * trA
-    lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg
+  Ayz = Ayz - F1o3 * gyz * trA
-    lgupzz =   ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg
+  Azz = Azz - F1o3 * gzz * trA
-    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+  detg = ONE / ( detg ** F1o3 ) 
                 + lgupzz * Azz(i,j,k) &
         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
                 + lgupyz * Ayz(i,j,k))
-    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
+  gxx = gxx * detg
-    Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA
+  gxy = gxy * detg
-    Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA
+  gxz = gxz * detg
-    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
+  gyy = gyy * detg
-    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA
+  gyz = gyz * detg
-    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
+  gzz = gzz * detg
-    lscale = ONE / ( ldetg ** F1o3 )
+  dxx = gxx - ONE
-
+  dyy = gyy - ONE
-    dxx(i,j,k) = lgxx * lscale - ONE
+  dzz = gzz - ONE
    gxy(i,j,k) = gxy(i,j,k) * lscale
    gxz(i,j,k) = gxz(i,j,k) * lscale
    dyy(i,j,k) = lgyy * lscale - ONE
    gyz(i,j,k) = gyz(i,j,k) * lscale
    dzz(i,j,k) = lgzz * lscale - ONE
  enddo
  enddo
  enddo
  return
@@ -95,70 +83,50 @@
 !~~~~~~~> Local variable:
-  integer :: i,j,k
+  real*8, dimension(ex(1),ex(2),ex(3)) :: trA
-  real*8 :: lgxx,lgyy,lgzz,lscale
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz 
-  real*8 :: lgxy,lgxz,lgyz
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
  real*8 :: ltrA
  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
 !~~~~~~>
-  do k=1,ex(3)
+  gxx = dxx + ONE
-  do j=1,ex(2)
+  gyy = dyy + ONE
-  do i=1,ex(1)
+  gzz = dzz + ONE
 ! for g
  gupzz =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
           gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
-! for g: normalize determinant first
+  gupzz = ONE / ( gupzz ** F1o3 ) 
    lgxx = dxx(i,j,k) + ONE
    lgyy = dyy(i,j,k) + ONE
    lgzz = dzz(i,j,k) + ONE
    lgxy = gxy(i,j,k)
    lgxz = gxz(i,j,k)
    lgyz = gyz(i,j,k)
-    lscale =  lgxx * lgyy * lgzz + lgxy * lgyz * lgxz &
+  gxx = gxx * gupzz
-            + lgxz * lgxy * lgyz - lgxz * lgyy * lgxz &
+  gxy = gxy * gupzz
-            - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz
+  gxz = gxz * gupzz
  gyy = gyy * gupzz
  gyz = gyz * gupzz
  gzz = gzz * gupzz
-    lscale = ONE / ( lscale ** F1o3 )
+  dxx = gxx - ONE
  dyy = gyy - ONE
  dzz = gzz - ONE
 ! for A  
-    lgxx = lgxx * lscale
+  gupxx =   ( gyy * gzz - gyz * gyz )
-    lgxy = lgxy * lscale
+  gupxy = - ( gxy * gzz - gyz * gxz )
-    lgxz = lgxz * lscale
+  gupxz =   ( gxy * gyz - gyy * gxz )
-    lgyy = lgyy * lscale
+  gupyy =   ( gxx * gzz - gxz * gxz )
-    lgyz = lgyz * lscale
+  gupyz = - ( gxx * gyz - gxy * gxz )
-    lgzz = lgzz * lscale
+  gupzz =   ( gxx * gyy - gxy * gxy )
-    dxx(i,j,k) = lgxx - ONE
+  trA =         gupxx * Axx + gupyy * Ayy + gupzz * Azz &
-    gxy(i,j,k) = lgxy
+       + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
    gxz(i,j,k) = lgxz
    dyy(i,j,k) = lgyy - ONE
    gyz(i,j,k) = lgyz
    dzz(i,j,k) = lgzz - ONE
-! for A: trace-free using normalized metric (det=1, no division needed)
+  Axx = Axx - F1o3 * gxx * trA
-    lgupxx =   ( lgyy * lgzz - lgyz * lgyz )
+  Axy = Axy - F1o3 * gxy * trA
-    lgupxy = - ( lgxy * lgzz - lgyz * lgxz )
+  Axz = Axz - F1o3 * gxz * trA
-    lgupxz =   ( lgxy * lgyz - lgyy * lgxz )
+  Ayy = Ayy - F1o3 * gyy * trA
-    lgupyy =   ( lgxx * lgzz - lgxz * lgxz )
+  Ayz = Ayz - F1o3 * gyz * trA
-    lgupyz = - ( lgxx * lgyz - lgxy * lgxz )
+  Azz = Azz - F1o3 * gzz * trA
    lgupzz =   ( lgxx * lgyy - lgxy * lgxy )
    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
                 + lgupzz * Azz(i,j,k) &
         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
                 + lgupyz * Ayz(i,j,k))
    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
    Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
    Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
  enddo
  enddo
  enddo
  return
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -324,6 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -349,6 +350,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -377,6 +379,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -883,6 +886,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -908,6 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -936,6 +941,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -1112,65 +1118,64 @@ end subroutine d2dump
 ! Lagrangian polynomial interpolation
 !------------------------------------------------------------------------------
-  subroutine polint(xa, ya, x, y, dy, ordn)
+  subroutine polint(xa,ya,x,y,dy,ordn)
  implicit none
-  integer, intent(in) :: ordn
+!~~~~~~> Input Parameter:
-  real*8, dimension(ordn), intent(in) :: xa, ya
+  integer,intent(in) :: ordn
  real*8, dimension(ordn), intent(in) :: xa,ya
  real*8, intent(in) :: x
-  real*8, intent(out) :: y, dy
+  real*8, intent(out) :: y,dy
-  integer :: i, m, ns, n_m
+!~~~~~~> Other parameter:
  real*8, dimension(ordn) :: c, d, ho
  real*8 :: dif, dift, hp, h, den_val
-  c = ya
+  integer :: m,n,ns
-  d = ya
+  real*8, dimension(ordn) :: c,d,den,ho
-  ho = xa - x
+  real*8 :: dif,dift
-  ns = 1
+!~~~~~~>
  dif = abs(x - xa(1))
-  do i = 2, ordn
+  n=ordn
-    dift = abs(x - xa(i))
+  m=ordn
-    if (dift < dif) then
+
-      ns = i
+  c=ya
-      dif = dift
+  d=ya
-    end if
+  ho=xa-x
  ns=1
  dif=abs(x-xa(1))
  do m=1,n
   dift=abs(x-xa(m))
   if(dift < dif) then
    ns=m
    dif=dift
   end if
  end do
-  y = ya(ns)
+  y=ya(ns)
-  ns = ns - 1
+  ns=ns-1
-
+  do m=1,n-1
-  do m = 1, ordn - 1
+    den(1:n-m)=ho(1:n-m)-ho(1+m:n)
-    n_m = ordn - m
+    if (any(den(1:n-m) == 0.0))then
-    do i = 1, n_m
+      write(*,*) 'failure in polint for point',x
-      hp = ho(i)
+      write(*,*) 'with input points: ',xa
-      h  = ho(i+m)
+      stop
-      den_val = hp - h
+    endif
-
+    den(1:n-m)=(c(2:n-m+1)-d(1:n-m))/den(1:n-m)
-      if (den_val == 0.0d0) then
+    d(1:n-m)=ho(1+m:n)*den(1:n-m)
-        write(*,*) 'failure in polint for point',x
+    c(1:n-m)=ho(1:n-m)*den(1:n-m)
-        write(*,*) 'with input points: ',xa
+    if (2*ns < n-m) then
-        stop
+      dy=c(ns+1)
      end if
      den_val = (c(i+1) - d(i)) / den_val
      d(i) = h * den_val
      c(i) = hp * den_val
    end do
    if (2 * ns < n_m) then
      dy = c(ns + 1)
    else
-      dy = d(ns)
+      dy=d(ns)
-      ns = ns - 1
+      ns=ns-1
    end if
-    y = y + dy
+    y=y+dy
  end do
  return
  end subroutine polint
 !------------------------------------------------------------------------------
 !
@@ -1178,37 +1183,35 @@ end subroutine d2dump
 !
 !------------------------------------------------------------------------------
  subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
  implicit none
 !~~~~~~> Input parameters:
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a
  real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2
  real*8, intent(out) :: y,dy
-#ifdef POLINT_LEGACY_ORDER
+!~~~~~~> Other parameters:
  integer  :: i,m
  real*8, dimension(ordn) :: ymtmp
  real*8, dimension(ordn) :: yntmp
  m=size(x1a)
  do i=1,m
    yntmp=ya(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
  end do
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
  integer  :: j
  real*8, dimension(ordn) :: ymtmp
  real*8 :: dy_temp
  do j=1,ordn
    call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
  end do
-  call polint(x2a, ymtmp, x2, y, dy, ordn)
+
-#endif
+  call polint(x1a,ymtmp,x1,y,dy,ordn)
  return
  end subroutine polin2
 !------------------------------------------------------------------------------
 !
@@ -1216,15 +1219,18 @@ end subroutine d2dump
 !
 !------------------------------------------------------------------------------
  subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
  implicit none
 !~~~~~~> Input parameters:
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
  real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2,x3
  real*8, intent(out) :: y,dy
-#ifdef POLINT_LEGACY_ORDER
+!~~~~~~> Other parameters:
  integer  :: i,j,m,n
  real*8, dimension(ordn,ordn) :: yatmp
  real*8, dimension(ordn) :: ymtmp
@@ -1233,33 +1239,24 @@ end subroutine d2dump
  m=size(x1a)
  n=size(x2a)
  do i=1,m
   do j=1,n
    yqtmp=ya(i,j,:)
    call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
   end do
    yntmp=yatmp(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
  end do
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
  integer  :: j, k
  real*8, dimension(ordn,ordn) :: yatmp
  real*8, dimension(ordn) :: ymtmp
  real*8 :: dy_temp
  do k=1,ordn
    do j=1,ordn
      call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
    end do
  end do
-  do k=1,ordn
+
-    call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
+  call polint(x1a,ymtmp,x1,y,dy,ordn)
  end do
  call polint(x3a, ymtmp, x3, y, dy, ordn)
 #endif
  return
  end subroutine polin3
 !--------------------------------------------------------------------------------------
 ! calculate L2norm
--- a/AMSS_NCKU_source/kodiss.f90
+++ b/AMSS_NCKU_source/kodiss.f90
@@ -215,99 +215,6 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
  end subroutine kodis
 !-----------------------------------------------------------------------------
 ! kodis_fh: kodis variant that reuses a caller-provided work array fh
 !           (memory pool) for the ghost-padded copy of f.
 !
 ! Adds to f_rhs, at every point that has three neighbours available on both
 ! sides in all three directions, the term
 !     eps/64 * ( S_x/dX + S_y/dY + S_z/dZ )
 ! where S_d is the seven-point sum with weights (1,-6,15,-20,15,-6,1) taken
 ! along direction d of fh.
 ! (Presumably the Kreiss-Oliger dissipation operator, going by the routine
 !  name - confirm against the non-fh kodis routine.)
 !
 ! Arguments:
 !   ex        grid extents in x, y, z
 !   X, Y, Z   coordinate arrays; the spacing is taken from the first two
 !             entries of each (uniform spacing assumed - TODO confirm)
 !   f         field the stencil is applied to
 !   f_rhs     right-hand side, updated in place
 !   SoA       symmetry factors forwarded unchanged to symmetry_bd
 !   Symmetry  0 = none, 2 = octant; any value > 0 enables the z reflection
 !   eps       coefficient multiplying the whole term
 !   fh        work array with lower bound -2 in each direction; it is handed
 !             to symmetry_bd, which fills it from f
 !-----------------------------------------------------------------------------
 subroutine kodis_fh(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps,fh)

 implicit none

 !~~~~~~> arguments
 integer,dimension(3),intent(in) :: ex
 integer,intent(in) :: Symmetry
 real*8,dimension(1:3),intent(in) :: SoA
 double precision,dimension(ex(1)),intent(in) :: X
 double precision,dimension(ex(2)),intent(in) :: Y
 double precision,dimension(ex(3)),intent(in) :: Z
 double precision,dimension(ex(1),ex(2),ex(3)),intent(in)    :: f
 double precision,dimension(ex(1),ex(2),ex(3)),intent(inout) :: f_rhs
 real*8,intent(in) :: eps
 real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)),intent(inout) :: fh

 !~~~~~~> locals
 integer :: ilo,jlo,klo,ihi,jhi,khi
 integer :: i,j,k
 real*8  :: dX,dY,dZ
 real*8  :: dis_x,dis_y,dis_z
 real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
 real*8, parameter :: cof=6.4d1   ! 2^6
 integer, parameter :: NO_SYMM=0, OCTANT=2

  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)

 ! The stencil reaches three points to each side.  Without ghost data the
 ! first usable index is 4; when the reflected planes -2:0 may be used
 ! (symmetry active and the first coordinate within one spacing of zero)
 ! the first usable index drops to 1.
  ilo = 4
  jlo = 4
  klo = 4
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) klo = 1
  if(Symmetry == OCTANT .and. dabs(X(1)) < dX) ilo = 1
  if(Symmetry == OCTANT .and. dabs(Y(1)) < dY) jlo = 1

 ! the upper side never has ghost data: stop three points short of the edge
  ihi = ex(1)-3
  jhi = ex(2)-3
  khi = ex(3)-3

 ! build the padded copy of f in the caller's work array
  call symmetry_bd(3,ex,f,fh,SoA)

  do k=klo,khi
  do j=jlo,jhi
  do i=ilo,ihi

    dis_x = (     (fh(i-3,j,k)+fh(i+3,j,k)) - &
              SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
              FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
              TWT* fh(i,j,k)            )/dX

    dis_y = (     (fh(i,j-3,k)+fh(i,j+3,k)) - &
              SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
              FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
              TWT* fh(i,j,k)            )/dY

    dis_z = (     (fh(i,j,k-3)+fh(i,j,k+3)) - &
              SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
              FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
              TWT* fh(i,j,k)            )/dZ

    f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( dis_x + dis_y + dis_z )

  enddo
  enddo
  enddo

  return

  end subroutine kodis_fh
 #elif (ghost_width == 4)
 ! sixth order code
 !------------------------------------------------------------------------------------------------------------------------------
--- a/AMSS_NCKU_source/lopsidediff.f90
+++ b/AMSS_NCKU_source/lopsidediff.f90
@@ -487,160 +487,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
  end subroutine lopsided
 !-----------------------------------------------------------------------------
 ! lopsided variant: reuses caller-provided fh work array (memory pool)
 !
 ! Adds to f_rhs, direction by direction, the product of a coefficient field
 ! (Sfx, Sfy, Sfz) with a five-point first-difference of f divided by 12*h.
 ! The stencil is shifted towards the side the coefficient's sign points to
 ! and falls back to a centred or oppositely shifted stencil when too few
 ! points remain before the grid edge.
 !
 ! Arguments:
 !   ex            grid extents in x, y, z
 !   X, Y, Z       coordinate arrays; spacing is taken from the first two
 !                 entries of each (uniform spacing assumed - TODO confirm)
 !   f             field being differenced
 !   f_rhs         right-hand side, updated in place
 !   Sfx,Sfy,Sfz   per-point coefficients whose sign selects the stencil
 !                 (presumably shift-vector components - confirm with caller)
 !   Symmetry      0 = none, 1 = equatorial, 2 = octant
 !   SoA           symmetry factors forwarded unchanged to symmetry_bd
 !   fh            work array with lower bound -2 in each direction; it is
 !                 handed to symmetry_bd, which fills it from f
 !-----------------------------------------------------------------------------
 subroutine lopsided_fh(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,fh)
  implicit none

 !~~~~~~> Input parameters:

  integer, intent(in)  :: ex(1:3),Symmetry
  real*8,  intent(in)  :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
  real*8,dimension(ex(1),ex(2),ex(3)),intent(in)   :: f,Sfx,Sfy,Sfz
  real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
  real*8,dimension(3),intent(in) ::SoA
  real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)),intent(inout):: fh

 !~~~~~~> local variables:
 ! imin..kmax : lowest / highest index a stencil is allowed to touch
 ! d12d*      : 1/(12*spacing), the common factor of every stencil below
 ! d2d*       : 1/(2*spacing); computed but not referenced again in this routine

  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: dX,dY,dZ
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
  real*8,  parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
  real*8,  parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
 ! When a symmetry is active and the first coordinate lies within one grid
 ! spacing of zero, the ghost planes -2:0 of fh count as usable on that side.
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
 ! Build the ghost-padded copy of f in the caller's work array.
  call symmetry_bd(3,ex,f,fh,SoA)
 ! The loops stop at ex-1.  Original remark: this is only for efficiency,
 ! because the loop body would add nothing at index ex anyway.
 ! NOTE(review): that holds for the "> ZEO" branches (none of their tests
 ! can pass at index ex), but the "< ZEO" branches test only the lower bound
 ! and would contribute at index ex - confirm the skip is intended.
 ! NOTE(review): the "> ZEO" branches never test imin/jmin/kmin, so near the
 ! lower edge they read fh at indices <= 0 even when no symmetry is active;
 ! those planes then hold whatever symmetry_bd stored - verify.
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 #if 0
 !! old code - same as original lopsided
 #else
 !! new code, 2012dec27, based on bam
 ! x direction
 ! Sfx > 0: prefer the stencil on i-1..i+3; with only two points left above
 !          use the centred stencil on i-2..i+2; with one point left use the
 !          mirrored stencil on i-3..i+1 (note the leading minus sign).
    if(Sfx(i,j,k) > ZEO)then
      if(i+3 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
     elseif(i+2 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
     elseif(i+1 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
     endif
 ! Sfx < 0: same scheme reflected - prefer i-3..i+1, then centred, then
 !          i-1..i+3, guarded by the lower bound imin.
   elseif(Sfx(i,j,k) < ZEO)then
      if(i-3 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
     elseif(i-2 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
     elseif(i-1 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
     endif
   endif
 ! y direction
 ! Same branch structure as the x direction, applied along j with Sfy,
 ! d12dy and the bounds jmin/jmax.
    if(Sfy(i,j,k) > ZEO)then
      if(j+3 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
     elseif(j+2 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
     elseif(j+1 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
     endif
   elseif(Sfy(i,j,k) < ZEO)then
      if(j-3 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
     elseif(j-2 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
     elseif(j-1 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
     endif
   endif
 ! z direction
 ! Same branch structure again, applied along k with Sfz, d12dz and the
 ! bounds kmin/kmax.
    if(Sfz(i,j,k) > ZEO)then
      if(k+3 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
     elseif(k+2 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
     elseif(k+1 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
     endif
   elseif(Sfz(i,j,k) < ZEO)then
      if(k-3 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
     elseif(k-2 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
     elseif(k-1 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
     endif
   endif
 #endif
  enddo
  enddo
  enddo
  return
  end subroutine lopsided_fh
 #elif (ghost_width == 4)
 ! sixth order code
 ! Compute advection terms in right hand sides of field equations
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -34,7 +34,7 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
 F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   prolongrestrict_cell.o prolongrestrict_vertex.o\
-	   rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
+	   rungekutta4_rout.o bssn_rhs_opt.o bssn_rhs.o bssn_rhs_legacy.o diff_new.o kodiss.o kodiss_sh.o\
 	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
 	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -7,8 +7,9 @@
 filein  = -I/usr/include/ -I${MKLROOT}/include
 ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
-## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
+LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
+          -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
          -lpthread -lm -ldl
 ## Aggressive optimization flags:
 ## -O3: Maximum optimization
@@ -16,10 +17,10 @@ LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore
 ## -fp-model fast=2: Aggressive floating-point optimizations
 ## -fma: Enable fused multiply-add instructions
 ## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
-CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -ipo \
+CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma \
               -Dfortran3 -Dnewc -I${MKLROOT}/include
-f90appflags  = -O3 -xHost -fp-model fast=2 -fma -ipo \
+f90appflags  = -O3 -xHost -fp-model fast=2 -fma \
-               -align array64byte -fpp -I${MKLROOT}/include
+               -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx
 CXX          = icpx
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -15,13 +15,12 @@ import subprocess
 ## taskset ensures all child processes inherit the CPU affinity mask
 ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
 ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
-NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
+NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
 #NUMACTL_CPU_BIND = "taskset -c 0-111"
 ## Build parallelism configuration
 ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
 ## Set make -j to utilize available cores for faster builds
-BUILD_JOBS = 64
+BUILD_JOBS = 104
 ##################################################################
Author	SHA1	Message	Date
CGH0S7	ed89bc029b	Fix potential division by zero in reta_val calculation and enable NaN checks Added a safety check for the denominator in the reta_val calculation to prevent division by zero when chi approaches zero (e.g., at far-field boundaries). Also enabled DEBUG_NAN_CHECK macro to catch invalid inputs early. Initialized output arrays to zero to prevent uninitialized memory access.	2026-01-19 20:29:48 +08:00
CGH0S7	19274e93d1	Fix boundary handling in bssn_rhs_opt.f90 to prevent NaNs Refactored calc_derivs and calc_dderivs to include correct boundary handling logic matching the legacy code. Implemented fallback to 2nd order derivatives when near boundaries where 4th order stencils cannot be used. Added logic to initialize output arrays to zero to avoid uninitialized memory access.	2026-01-19 20:03:22 +08:00
CGH0S7	ae1a474cca	Fix compilation errors and complete logic in BSSN RHS optimization	2026-01-19 19:22:52 +08:00
CGH0S7	cbb8fb3a87	patched last commit	2026-01-19 17:14:28 +08:00
CGH0S7	4472d89a9f	Optimize bssn_rhs calculation with cache blocking and vectorization - Implemented cache blocking (BLK=8) in bssn_rhs_opt.f90 to improve L1/L2 cache hit rate. - Introduced bssn_rhs_opt.f90 module with vectorized derivative and physics kernels. - Renamed original implementation to bssn_rhs_legacy.f90 for fallback. - Updated bssn_rhs.f90 to act as a dispatcher, using the optimized path for ghost_width=3. - Updated makefile to include new source files. - Added DEBUG_NAN_CHECK macro to optionally disable NaN checks in production.	2026-01-19 16:39:24 +08:00