Fix potential division by zero in reta_val calculation and enable NaN checks

Added a safety check on the denominator in the reta_val calculation to prevent division by zero when chi approaches zero (e.g., at far-field boundaries). Also enabled the DEBUG_NAN_CHECK macro to catch invalid inputs early, and initialized output arrays to zero to prevent reads of uninitialized memory.
Fix boundary handling in bssn_rhs_opt.f90 to prevent NaNs
2026-01-19 20:29:48 +08:00 · 2026-01-19 20:03:22 +08:00 · 2026-01-19 19:22:52 +08:00 · 2026-01-19 17:14:28 +08:00 · 2026-01-19 16:39:24 +08:00
14 changed files with 2761 additions and 1690 deletions
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,7 +16,7 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 64                             ## number of mpi processes used in the simulation
+MPI_processes    = 48                             ## number of mpi processes used in the simulation

 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
@@ -5,7 +5,6 @@
 #include <cstdio>
 #include <cstdlib>
 #include <string>
-#include <cstring>
 #include <iostream>
 #include <iomanip>
 #include <fstream>
@@ -61,110 +60,13 @@ TwoPunctures::TwoPunctures(double mp, double mm, double b,
  F = dvector(0, ntotal - 1);
  allocate_derivs(&u, ntotal);
  allocate_derivs(&v, ntotal);
-
-  // Allocate workspace buffers for hot-path allocation elimination
-  int N = maximum3(n1, n2, n3);
-  int maxn = maximum2(n1, n2);
-
-  // LineRelax_be workspace (sized for n2)
-  ws_diag_be = new double[n2];
-  ws_e_be = new double[n2 - 1];
-  ws_f_be = new double[n2 - 1];
-  ws_b_be = new double[n2];
-  ws_x_be = new double[n2];
-
-  // LineRelax_al workspace (sized for n1)
-  ws_diag_al = new double[n1];
-  ws_e_al = new double[n1 - 1];
-  ws_f_al = new double[n1 - 1];
-  ws_b_al = new double[n1];
-  ws_x_al = new double[n1];
-
-  // ThomasAlgorithm workspace (sized for max(n1,n2))
-  ws_thomas_y = new double[maxn];
-
-  // JFD_times_dv workspace (sized for nvar)
-  ws_jfd_values = dvector(0, nvar - 1);
-  allocate_derivs(&ws_jfd_dU, nvar);
-  allocate_derivs(&ws_jfd_U, nvar);
-
-  // chebft_Zeros workspace (sized for N+1)
-  ws_cheb_c = dvector(0, N);
-
-  // fourft workspace (sized for N/2+1 each)
-  ws_four_a = dvector(0, N / 2);
-  ws_four_b = dvector(0, N / 2);
-
-  // Derivatives_AB3 workspace
-  ws_deriv_p = dvector(0, N);
-  ws_deriv_dp = dvector(0, N);
-  ws_deriv_d2p = dvector(0, N);
-  ws_deriv_q = dvector(0, N);
-  ws_deriv_dq = dvector(0, N);
-  ws_deriv_r = dvector(0, N);
-  ws_deriv_dr = dvector(0, N);
-  ws_deriv_indx = ivector(0, N);
-
-  // F_of_v workspace
-  ws_fov_sources = new double[n1 * n2 * n3];
-  ws_fov_values = dvector(0, nvar - 1);
-  allocate_derivs(&ws_fov_U, nvar);
-
-  // J_times_dv workspace
-  ws_jtdv_values = dvector(0, nvar - 1);
-  allocate_derivs(&ws_jtdv_dU, nvar);
-  allocate_derivs(&ws_jtdv_U, nvar);
 }

 TwoPunctures::~TwoPunctures()
 {
-  int const nvar = 1, n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi;
-  int N = maximum3(n1, n2, n3);
-
  free_dvector(F, 0, ntotal - 1);
  free_derivs(&u, ntotal);
  free_derivs(&v, ntotal);
-
-  // Free workspace buffers
-  delete[] ws_diag_be;
-  delete[] ws_e_be;
-  delete[] ws_f_be;
-  delete[] ws_b_be;
-  delete[] ws_x_be;
-
-  delete[] ws_diag_al;
-  delete[] ws_e_al;
-  delete[] ws_f_al;
-  delete[] ws_b_al;
-  delete[] ws_x_al;
-
-  delete[] ws_thomas_y;
-
-  free_dvector(ws_jfd_values, 0, nvar - 1);
-  free_derivs(&ws_jfd_dU, nvar);
-  free_derivs(&ws_jfd_U, nvar);
-
-  free_dvector(ws_cheb_c, 0, N);
-
-  free_dvector(ws_four_a, 0, N / 2);
-  free_dvector(ws_four_b, 0, N / 2);
-
-  free_dvector(ws_deriv_p, 0, N);
-  free_dvector(ws_deriv_dp, 0, N);
-  free_dvector(ws_deriv_d2p, 0, N);
-  free_dvector(ws_deriv_q, 0, N);
-  free_dvector(ws_deriv_dq, 0, N);
-  free_dvector(ws_deriv_r, 0, N);
-  free_dvector(ws_deriv_dr, 0, N);
-  free_ivector(ws_deriv_indx, 0, N);
-
-  delete[] ws_fov_sources;
-  free_dvector(ws_fov_values, 0, nvar - 1);
-  free_derivs(&ws_fov_U, nvar);
-
-  free_dvector(ws_jtdv_values, 0, nvar - 1);
-  free_derivs(&ws_jtdv_dU, nvar);
-  free_derivs(&ws_jtdv_U, nvar);
 }

 void TwoPunctures::Solve()
@@ -753,7 +655,7 @@ void TwoPunctures::chebft_Zeros(double u[], int n, int inv)
  int k, j, isignum;
  double fac, sum, Pion, *c;

-  c = ws_cheb_c;
+  c = dvector(0, n);
  Pion = Pi / n;
  if (inv == 0)
  {
@@ -784,6 +686,7 @@ void TwoPunctures::chebft_Zeros(double u[], int n, int inv)
  }
  for (j = 0; j < n; j++)
    u[j] = c[j];
+  free_dvector(c, 0, n);
 }

 /* --------------------------------------------------------------------------*/
@@ -871,8 +774,8 @@ void TwoPunctures::fourft(double *u, int N, int inv)
  double x, x1, fac, Pi_fac, *a, *b;

  M = N / 2;
-  a = ws_four_a;
-  b = ws_four_b - 1; /* offset to match dvector(1,M) indexing */
+  a = dvector(0, M);
+  b = dvector(1, M); /* Actually: b=vector(1,M-1) but this is problematic if M=1*/
  fac = 1. / M;
  Pi_fac = Pi * fac;
  if (inv == 0)
@@ -921,6 +824,8 @@ void TwoPunctures::fourft(double *u, int N, int inv)
      iy = -iy;
    }
  }
+  free_dvector(a, 0, M);
+  free_dvector(b, 1, M);
 }

 /* -----------------------------------------*/
@@ -1213,14 +1118,14 @@ void TwoPunctures::Derivatives_AB3(int nvar, int n1, int n2, int n3, derivs v)
  double *p, *dp, *d2p, *q, *dq, *r, *dr;

  N = maximum3(n1, n2, n3);
-  p = ws_deriv_p;
-  dp = ws_deriv_dp;
-  d2p = ws_deriv_d2p;
-  q = ws_deriv_q;
-  dq = ws_deriv_dq;
-  r = ws_deriv_r;
-  dr = ws_deriv_dr;
-  indx = ws_deriv_indx;
+  p = dvector(0, N);
+  dp = dvector(0, N);
+  d2p = dvector(0, N);
+  q = dvector(0, N);
+  dq = dvector(0, N);
+  r = dvector(0, N);
+  dr = dvector(0, N);
+  indx = ivector(0, N);

  for (ivar = 0; ivar < nvar; ivar++)
  {
@@ -1303,6 +1208,14 @@ void TwoPunctures::Derivatives_AB3(int nvar, int n1, int n2, int n3, derivs v)
      }
    }
  }
+  free_dvector(p, 0, N);
+  free_dvector(dp, 0, N);
+  free_dvector(d2p, 0, N);
+  free_dvector(q, 0, N);
+  free_dvector(dq, 0, N);
+  free_dvector(r, 0, N);
+  free_dvector(dr, 0, N);
+  free_ivector(indx, 0, N);
 }
 /* --------------------------------------------------------------------------*/
 void TwoPunctures::Newton(int const nvar, int const n1, int const n2, int const n3,
@@ -1371,11 +1284,10 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
  derivs U;
  double *sources;

-  values = ws_fov_values;
-  U = ws_fov_U;
+  values = dvector(0, nvar - 1);
+  allocate_derivs(&U, nvar);

-  sources = ws_fov_sources;
-  memset(sources, 0, n1 * n2 * n3 * sizeof(double));
+  sources = (double *)calloc(n1 * n2 * n3, sizeof(double));
  if (0)
  {
    double *s_x, *s_y, *s_z;
@@ -1530,6 +1442,9 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
  {
    fclose(debugfile);
  }
+  free(sources);
+  free_dvector(values, 0, nvar - 1);
+  free_derivs(&U, nvar);
 }
 /* --------------------------------------------------------------------------*/
 double TwoPunctures::norm_inf(double const *F, int const ntotal)
@@ -1935,12 +1850,11 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl

  Derivatives_AB3(nvar, n1, n2, n3, dv);

-  values = ws_jtdv_values;
-  dU = ws_jtdv_dU;
-  U = ws_jtdv_U;
-
  for (i = 0; i < n1; i++)
  {
+    values = dvector(0, nvar - 1);
+    allocate_derivs(&dU, nvar);
+    allocate_derivs(&U, nvar);
    for (j = 0; j < n2; j++)
    {
      for (k = 0; k < n3; k++)
@@ -1994,6 +1908,9 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl
        }
      }
    }
+    free_dvector(values, 0, nvar - 1);
+    free_derivs(&dU, nvar);
+    free_derivs(&U, nvar);
  }
 }
 /* --------------------------------------------------------------------------*/
@@ -2040,11 +1957,17 @@ void TwoPunctures::LineRelax_be(double *dv,
 {
  int j, m, Ic, Ip, Im, col, ivar;

-  double *diag = ws_diag_be;
-  double *e = ws_e_be;     /* above diagonal */
-  double *f = ws_f_be;     /* below diagonal */
-  double *b = ws_b_be;     /* rhs */
-  double *x = ws_x_be;     /* solution vector */
+  double *diag = new double[n2];
+  double *e = new double[n2 - 1]; /* above diagonal */
+  double *f = new double[n2 - 1]; /* below diagonal */
+  double *b = new double[n2];     /* rhs */
+  double *x = new double[n2];     /* solution vector */
+
+  //  gsl_vector *diag = gsl_vector_alloc(n2);
+  //  gsl_vector *e = gsl_vector_alloc(n2-1); /* above diagonal */
+  //  gsl_vector *f = gsl_vector_alloc(n2-1); /* below diagonal */
+  //  gsl_vector *b = gsl_vector_alloc(n2);   /* rhs */
+  //  gsl_vector *x = gsl_vector_alloc(n2);   /* solution vector */

  for (ivar = 0; ivar < nvar; ivar++)
  {
@@ -2054,35 +1977,62 @@ void TwoPunctures::LineRelax_be(double *dv,
    }
    diag[n2 - 1] = 0;

+    //    gsl_vector_set_zero(diag);
+    //    gsl_vector_set_zero(e);
+    //    gsl_vector_set_zero(f);
    for (j = 0; j < n2; j++)
    {
      Ip = Index(ivar, i, j + 1, k, nvar, n1, n2, n3);
      Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
      Im = Index(ivar, i, j - 1, k, nvar, n1, n2, n3);
      b[j] = rhs[Ic];
+      //      gsl_vector_set(b,j,rhs[Ic]);
      for (m = 0; m < ncols[Ic]; m++)
      {
        col = cols[Ic][m];
        if (col != Ip && col != Ic && col != Im)
          b[j] -= JFD[Ic][m] * dv[col];
+        //          *gsl_vector_ptr(b, j) -= JFD[Ic][m] * dv[col];
        else
        {
          if (col == Im && j > 0)
            f[j - 1] = JFD[Ic][m];
+          //            gsl_vector_set(f,j-1,JFD[Ic][m]);
          if (col == Ic)
            diag[j] = JFD[Ic][m];
+          //            gsl_vector_set(diag,j,JFD[Ic][m]);
          if (col == Ip && j < n2 - 1)
            e[j] = JFD[Ic][m];
+          //            gsl_vector_set(e,j,JFD[Ic][m]);
        }
      }
    }
+    //          A x = b
+    //          A = ( d_0 e_0  0   0  )
+    //              ( f_0 d_1 e_1  0  )
+    //              (  0  f_1 d_2 e_2 )
+    //              (  0   0  f_2 d_3 )
+    //
    ThomasAlgorithm(n2, f, diag, e, x, b);
+    //    gsl_linalg_solve_tridiag(diag, e, f, b, x);
    for (j = 0; j < n2; j++)
    {
      Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
      dv[Ic] = x[j];
+      //      dv[Ic] = gsl_vector_get(x, j);
    }
  }
+
+  delete[] diag;
+  delete[] e;
+  delete[] f;
+  delete[] b;
+  delete[] x;
+  //  gsl_vector_free(diag);
+  //  gsl_vector_free(e);
+  //  gsl_vector_free(f);
+  //  gsl_vector_free(b);
+  //  gsl_vector_free(x);
 }
 /* --------------------------------------------------------------------------*/
 void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
@@ -2099,8 +2049,8 @@ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
      ha, ga, ga2, hb, gb, gb2, hp, gp, gp2, gagb, gagp, gbgp;
  derivs dU, U;

-  dU = ws_jfd_dU;
-  U = ws_jfd_U;
+  allocate_derivs(&dU, nvar);
+  allocate_derivs(&U, nvar);

  if (k < 0)
    k = k + n3;
@@ -2218,6 +2168,9 @@ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
  LinEquations(A, B, X, R, x, r, phi, y, z, dU, U, values);
  for (ivar = 0; ivar < nvar; ivar++)
    values[ivar] *= FAC;
+
+  free_derivs(&dU, nvar);
+  free_derivs(&U, nvar);
 }
 #undef FAC
 /*-----------------------------------------------------------*/
@@ -2249,11 +2202,17 @@ void TwoPunctures::LineRelax_al(double *dv,
 {
  int i, m, Ic, Ip, Im, col, ivar;

-  double *diag = ws_diag_al;
-  double *e = ws_e_al;     /* above diagonal */
-  double *f = ws_f_al;     /* below diagonal */
-  double *b = ws_b_al;     /* rhs */
-  double *x = ws_x_al;     /* solution vector */
+  double *diag = new double[n1];
+  double *e = new double[n1 - 1]; /* above diagonal */
+  double *f = new double[n1 - 1]; /* below diagonal */
+  double *b = new double[n1];     /* rhs */
+  double *x = new double[n1];     /* solution vector */
+
+  //  gsl_vector *diag = gsl_vector_alloc(n1);
+  //  gsl_vector *e = gsl_vector_alloc(n1-1); /* above diagonal */
+  //  gsl_vector *f = gsl_vector_alloc(n1-1); /* below diagonal */
+  //  gsl_vector *b = gsl_vector_alloc(n1);   /* rhs */
+  //  gsl_vector *x = gsl_vector_alloc(n1);   /* solution vector */

  for (ivar = 0; ivar < nvar; ivar++)
  {
@@ -2263,35 +2222,57 @@ void TwoPunctures::LineRelax_al(double *dv,
    }
    diag[n1 - 1] = 0;

+    //    gsl_vector_set_zero(diag);
+    //    gsl_vector_set_zero(e);
+    //    gsl_vector_set_zero(f);
    for (i = 0; i < n1; i++)
    {
      Ip = Index(ivar, i + 1, j, k, nvar, n1, n2, n3);
      Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
      Im = Index(ivar, i - 1, j, k, nvar, n1, n2, n3);
      b[i] = rhs[Ic];
+      //      gsl_vector_set(b,i,rhs[Ic]);
      for (m = 0; m < ncols[Ic]; m++)
      {
        col = cols[Ic][m];
        if (col != Ip && col != Ic && col != Im)
          b[i] -= JFD[Ic][m] * dv[col];
+        //          *gsl_vector_ptr(b, i) -= JFD[Ic][m] * dv[col];
        else
        {
          if (col == Im && i > 0)
            f[i - 1] = JFD[Ic][m];
+          //            gsl_vector_set(f,i-1,JFD[Ic][m]);
          if (col == Ic)
            diag[i] = JFD[Ic][m];
+          //            gsl_vector_set(diag,i,JFD[Ic][m]);
          if (col == Ip && i < n1 - 1)
            e[i] = JFD[Ic][m];
+          //            gsl_vector_set(e,i,JFD[Ic][m]);
        }
      }
    }
    ThomasAlgorithm(n1, f, diag, e, x, b);
+    //    gsl_linalg_solve_tridiag(diag, e, f, b, x);
    for (i = 0; i < n1; i++)
    {
      Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
      dv[Ic] = x[i];
+      //      dv[Ic] = gsl_vector_get(x, i);
    }
  }
+
+  delete[] diag;
+  delete[] e;
+  delete[] f;
+  delete[] b;
+  delete[] x;
+
+  //  gsl_vector_free(diag);
+  //  gsl_vector_free(e);
+  //  gsl_vector_free(f);
+  //  gsl_vector_free(b);
+  //  gsl_vector_free(x);
 }
 /* -------------------------------------------------------------------------*/
 // a[N], b[N-1], c[N-1], x[N], q[N]
@@ -2303,29 +2284,44 @@ void TwoPunctures::LineRelax_al(double *dv,
 //"Parallel Scientific Computing in C++ and MPI" P361
 void TwoPunctures::ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q)
 {
-  // In-place Thomas algorithm: uses a[] as d workspace, b[] as l workspace.
-  // c[] is already u (above-diagonal). ws_thomas_y is pre-allocated workspace.
  int i;
-  double *y = ws_thomas_y;
+  double *l, *u, *d, *y;
+  l = new double[N - 1];
+  u = new double[N - 1];
+  d = new double[N];
+  y = new double[N];
+
+  /* LU Decomposition */
+  d[0] = a[0];
+  u[0] = c[0];

-  /* LU Decomposition (in-place: a becomes d, b becomes l) */
  for (i = 0; i < N - 2; i++)
  {
-    b[i] = b[i] / a[i];
-    a[i + 1] = a[i + 1] - b[i] * c[i];
+    l[i] = b[i] / d[i];
+    d[i + 1] = a[i + 1] - l[i] * u[i];
+    u[i + 1] = c[i + 1];
  }
-  b[N - 2] = b[N - 2] / a[N - 2];
-  a[N - 1] = a[N - 1] - b[N - 2] * c[N - 2];
+
+  l[N - 2] = b[N - 2] / d[N - 2];
+  d[N - 1] = a[N - 1] - l[N - 2] * u[N - 2];

  /* Forward Substitution [L][y] = [q] */
  y[0] = q[0];
  for (i = 1; i < N; i++)
-    y[i] = q[i] - b[i - 1] * y[i - 1];
+    y[i] = q[i] - l[i - 1] * y[i - 1];

  /* Backward Substitution [U][x] = [y] */
-  x[N - 1] = y[N - 1] / a[N - 1];
+  x[N - 1] = y[N - 1] / d[N - 1];
+
  for (i = N - 2; i >= 0; i--)
-    x[i] = (y[i] - c[i] * x[i + 1]) / a[i];
+    x[i] = (y[i] - u[i] * x[i + 1]) / d[i];
+
+  delete[] l;
+  delete[] u;
+  delete[] d;
+  delete[] y;
+
+  return;
 }
 // --------------------------------------------------------------------------*/
 // Calculates the value of v at an arbitrary position (x,y,z) if the spectral coefficients are know*/*/
--- a/AMSS_NCKU_source/TwoPunctures.h
+++ b/AMSS_NCKU_source/TwoPunctures.h
@@ -42,33 +42,6 @@ private:

       int ntotal;

-       // Pre-allocated workspace buffers for hot-path allocation elimination
-       // LineRelax_be workspace (sized for n2)
-       double *ws_diag_be, *ws_e_be, *ws_f_be, *ws_b_be, *ws_x_be;
-       // LineRelax_al workspace (sized for n1)
-       double *ws_diag_al, *ws_e_al, *ws_f_al, *ws_b_al, *ws_x_al;
-       // ThomasAlgorithm workspace (sized for max(n1,n2))
-       double *ws_thomas_y;
-       // JFD_times_dv workspace (sized for nvar)
-       double *ws_jfd_values;
-       derivs ws_jfd_dU, ws_jfd_U;
-       // chebft_Zeros workspace (sized for max(n1,n2,n3)+1)
-       double *ws_cheb_c;
-       // fourft workspace (sized for max(n1,n2,n3)/2+1 each)
-       double *ws_four_a, *ws_four_b;
-       // Derivatives_AB3 workspace
-       double *ws_deriv_p, *ws_deriv_dp, *ws_deriv_d2p;
-       double *ws_deriv_q, *ws_deriv_dq;
-       double *ws_deriv_r, *ws_deriv_dr;
-       int *ws_deriv_indx;
-       // F_of_v workspace
-       double *ws_fov_sources;
-       double *ws_fov_values;
-       derivs ws_fov_U;
-       // J_times_dv workspace
-       double *ws_jtdv_values;
-       derivs ws_jtdv_dU, ws_jtdv_U;
-
       struct parameters
       {
              int nvar, n1, n2, n3;
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
--- a/AMSS_NCKU_source/bssn_rhs_legacy.f90
+++ b/AMSS_NCKU_source/bssn_rhs_legacy.f90
--- a/AMSS_NCKU_source/bssn_rhs_opt.f90
+++ b/AMSS_NCKU_source/bssn_rhs_opt.f90
--- a/AMSS_NCKU_source/diff_new.f90
+++ b/AMSS_NCKU_source/diff_new.f90
@@ -997,10 +997,10 @@
  fy = ZEO
  fz = ZEO

-#if 0
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
+#if 0  
 ! x direction   
        if(i+2 <= imax .and. i-2 >= imin)then
 !
@@ -1040,13 +1040,9 @@

 ! set kmax and kmin 0
    endif
-  enddo
-  enddo
-  enddo
 #elif 0
-  do k=1,ex(3)-1
-  do j=1,ex(2)-1
-  do i=1,ex(1)-1
+! x direction   
+        if(i+2 <= imax .and. i-2 >= imin)then
 !
 !              f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
 !  fx(i) = ---------------------------------------------
@@ -1083,32 +1079,8 @@

 ! set kmax and kmin 0
    endif
-  enddo
-  enddo
-  enddo
 #else
-! for bam comparison — split into branch-free interior + serial boundary
-! Interior: all stencil points guaranteed in-bounds, no branches needed
-  !$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
-  do k=max(3,1),min(ex(3)-1,kmax-2)
-  do j=max(3,1),min(ex(2)-1,jmax-2)
-  !DIR$ IVDEP
-  do i=max(3,1),min(ex(1)-1,imax-2)
-      fx(i,j,k)=d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
-      fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
-      fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
-  enddo
-  enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-! Boundary shell: original branching logic for points near edges
-  do k=1,ex(3)-1
-  do j=1,ex(2)-1
-  do i=1,ex(1)-1
-   if(i >= 3 .and. i <= imax-2 .and. &
-      j >= 3 .and. j <= jmax-2 .and. &
-      k >= 3 .and. k <= kmax-2) cycle
+! for bam comparison
   if(i+2 <= imax .and. i-2 >= imin .and. &
      j+2 <= jmax .and. j-2 >= jmin .and. &
      k+2 <= kmax .and. k-2 >= kmin) then
@@ -1122,10 +1094,10 @@
      fy(i,j,k)=d2dy*(-fh(i,j-1,k)+fh(i,j+1,k))
      fz(i,j,k)=d2dz*(-fh(i,j,k-1)+fh(i,j,k+1))
   endif
-  enddo
-  enddo
-  enddo
 #endif
+  enddo
+  enddo
+  enddo

  return

@@ -1429,10 +1401,10 @@
  fxz = ZEO
  fyz = ZEO

-#if 0
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
+#if 0  
 !~~~~~~ fxx
        if(i+2 <= imax .and. i-2 >= imin)then
 !
@@ -1510,47 +1482,8 @@
   elseif(j+1 <= jmax .and. j-1 >= jmin .and. k+1 <= kmax .and. k-1 >= kmin)then
   fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1))
   endif 
-  enddo
-  enddo
-  enddo
 #else
-! for bam comparison — split into branch-free interior + serial boundary
-! Interior: all stencil points guaranteed in-bounds, no branches needed
-  !$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
-  do k=max(3,1),min(ex(3)-1,kmax-2)
-  do j=max(3,1),min(ex(2)-1,jmax-2)
-  !DIR$ IVDEP
-  do i=max(3,1),min(ex(1)-1,imax-2)
-   fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
-                       -fh(i+2,j,k)+F16*fh(i+1,j,k)              )
-   fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
-                       -fh(i,j+2,k)+F16*fh(i,j+1,k)              )
-   fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
-                       -fh(i,j,k+2)+F16*fh(i,j,k+1)              )
-   fxy(i,j,k) = Fdxdy*(     (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k))  &
-                       -F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k))  &
-                       +F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k))  &
-                       -    (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
-   fxz(i,j,k) = Fdxdz*(     (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2))  &
-                       -F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1))  &
-                       +F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1))  &
-                       -    (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
-   fyz(i,j,k) = Fdydz*(     (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2))  &
-                       -F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1))  &
-                       +F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1))  &
-                       -    (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
-  enddo
-  enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-! Boundary shell: original branching logic for points near edges
-  do k=1,ex(3)-1
-  do j=1,ex(2)-1
-  do i=1,ex(1)-1
-   if(i >= 3 .and. i <= imax-2 .and. &
-      j >= 3 .and. j <= jmax-2 .and. &
-      k >= 3 .and. k <= kmax-2) cycle
+! for bam comparison
   if(i+2 <= imax .and. i-2 >= imin .and. &
      j+2 <= jmax .and. j-2 >= jmin .and. &
      k+2 <= kmax .and. k-2 >= kmin) then
@@ -1585,10 +1518,10 @@
   fxz(i,j,k) = Sdxdz*(fh(i-1,j,k-1)-fh(i+1,j,k-1)-fh(i-1,j,k+1)+fh(i+1,j,k+1))
   fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1))
   endif
-  enddo
-  enddo
-  enddo
 #endif
+   enddo
+   enddo
+   enddo

  return

--- a/AMSS_NCKU_source/enforce_algebra.f90
+++ b/AMSS_NCKU_source/enforce_algebra.f90
@@ -19,60 +19,48 @@

 !~~~~~~~> Local variable:
  
-  integer :: i,j,k
-  real*8 :: lgxx,lgyy,lgzz,ldetg
-  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
-  real*8 :: ltrA,lscale
+  real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz 
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0

 !~~~~~~>

-  do k=1,ex(3)
-  do j=1,ex(2)
-  do i=1,ex(1)
+  gxx = dxx + ONE
+  gyy = dyy + ONE
+  gzz = dzz + ONE

-    lgxx = dxx(i,j,k) + ONE
-    lgyy = dyy(i,j,k) + ONE
-    lgzz = dzz(i,j,k) + ONE
+  detg =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
+          gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
+  gupxx =   ( gyy * gzz - gyz * gyz ) / detg
+  gupxy = - ( gxy * gzz - gyz * gxz ) / detg
+  gupxz =   ( gxy * gyz - gyy * gxz ) / detg
+  gupyy =   ( gxx * gzz - gxz * gxz ) / detg
+  gupyz = - ( gxx * gyz - gxy * gxz ) / detg
+  gupzz =   ( gxx * gyy - gxy * gxy ) / detg

-    ldetg =  lgxx * lgyy * lgzz &
-           + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) &
-           + gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
-           - gxz(i,j,k) * lgyy * gxz(i,j,k) &
-           - gxy(i,j,k) * gxy(i,j,k) * lgzz &
-           - lgxx * gyz(i,j,k) * gyz(i,j,k)
+  trA =         gupxx * Axx + gupyy * Ayy + gupzz * Azz &
+       + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)

-    lgupxx =   ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg
-    lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg
-    lgupxz =   ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg
-    lgupyy =   ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg
-    lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg
-    lgupzz =   ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg
+  Axx = Axx - F1o3 * gxx * trA
+  Axy = Axy - F1o3 * gxy * trA
+  Axz = Axz - F1o3 * gxz * trA
+  Ayy = Ayy - F1o3 * gyy * trA
+  Ayz = Ayz - F1o3 * gyz * trA
+  Azz = Azz - F1o3 * gzz * trA

-    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
-                 + lgupzz * Azz(i,j,k) &
-         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
-                 + lgupyz * Ayz(i,j,k))
+  detg = ONE / ( detg ** F1o3 ) 
  
-    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
-    Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA
-    Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA
-    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
-    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA
-    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
+  gxx = gxx * detg
+  gxy = gxy * detg
+  gxz = gxz * detg
+  gyy = gyy * detg
+  gyz = gyz * detg
+  gzz = gzz * detg

-    lscale = ONE / ( ldetg ** F1o3 )
-
-    dxx(i,j,k) = lgxx * lscale - ONE
-    gxy(i,j,k) = gxy(i,j,k) * lscale
-    gxz(i,j,k) = gxz(i,j,k) * lscale
-    dyy(i,j,k) = lgyy * lscale - ONE
-    gyz(i,j,k) = gyz(i,j,k) * lscale
-    dzz(i,j,k) = lgzz * lscale - ONE
-
-  enddo
-  enddo
-  enddo
+  dxx = gxx - ONE
+  dyy = gyy - ONE
+  dzz = gzz - ONE

  return

@@ -95,70 +83,50 @@

 !~~~~~~~> Local variable:
  
-  integer :: i,j,k
-  real*8 :: lgxx,lgyy,lgzz,lscale
-  real*8 :: lgxy,lgxz,lgyz
-  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
-  real*8 :: ltrA
+  real*8, dimension(ex(1),ex(2),ex(3)) :: trA
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz 
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0

 !~~~~~~>

-  do k=1,ex(3)
-  do j=1,ex(2)
-  do i=1,ex(1)
+  gxx = dxx + ONE
+  gyy = dyy + ONE
+  gzz = dzz + ONE
+! for g
+  gupzz =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
+           gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz

-! for g: normalize determinant first
-    lgxx = dxx(i,j,k) + ONE
-    lgyy = dyy(i,j,k) + ONE
-    lgzz = dzz(i,j,k) + ONE
-    lgxy = gxy(i,j,k)
-    lgxz = gxz(i,j,k)
-    lgyz = gyz(i,j,k)
+  gupzz = ONE / ( gupzz ** F1o3 ) 
  
-    lscale =  lgxx * lgyy * lgzz + lgxy * lgyz * lgxz &
-            + lgxz * lgxy * lgyz - lgxz * lgyy * lgxz &
-            - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz
+  gxx = gxx * gupzz
+  gxy = gxy * gupzz
+  gxz = gxz * gupzz
+  gyy = gyy * gupzz
+  gyz = gyz * gupzz
+  gzz = gzz * gupzz

-    lscale = ONE / ( lscale ** F1o3 )
+  dxx = gxx - ONE
+  dyy = gyy - ONE
+  dzz = gzz - ONE
+! for A  

-    lgxx = lgxx * lscale
-    lgxy = lgxy * lscale
-    lgxz = lgxz * lscale
-    lgyy = lgyy * lscale
-    lgyz = lgyz * lscale
-    lgzz = lgzz * lscale
+  gupxx =   ( gyy * gzz - gyz * gyz )
+  gupxy = - ( gxy * gzz - gyz * gxz )
+  gupxz =   ( gxy * gyz - gyy * gxz )
+  gupyy =   ( gxx * gzz - gxz * gxz )
+  gupyz = - ( gxx * gyz - gxy * gxz )
+  gupzz =   ( gxx * gyy - gxy * gxy )

-    dxx(i,j,k) = lgxx - ONE
-    gxy(i,j,k) = lgxy
-    gxz(i,j,k) = lgxz
-    dyy(i,j,k) = lgyy - ONE
-    gyz(i,j,k) = lgyz
-    dzz(i,j,k) = lgzz - ONE
+  trA =         gupxx * Axx + gupyy * Ayy + gupzz * Azz &
+       + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)

-! for A: trace-free using normalized metric (det=1, no division needed)
-    lgupxx =   ( lgyy * lgzz - lgyz * lgyz )
-    lgupxy = - ( lgxy * lgzz - lgyz * lgxz )
-    lgupxz =   ( lgxy * lgyz - lgyy * lgxz )
-    lgupyy =   ( lgxx * lgzz - lgxz * lgxz )
-    lgupyz = - ( lgxx * lgyz - lgxy * lgxz )
-    lgupzz =   ( lgxx * lgyy - lgxy * lgxy )
-
-    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
-                 + lgupzz * Azz(i,j,k) &
-         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
-                 + lgupyz * Ayz(i,j,k))
-
-    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
-    Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
-    Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
-    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
-    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
-    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
-
-  enddo
-  enddo
-  enddo
+  Axx = Axx - F1o3 * gxx * trA
+  Axy = Axy - F1o3 * gxy * trA
+  Axz = Axz - F1o3 * gxz * trA
+  Ayy = Ayy - F1o3 * gyy * trA
+  Ayz = Ayz - F1o3 * gyz * trA
+  Azz = Azz - F1o3 * gzz * trA

  return

--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -324,6 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)

  integer::i

+  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -349,6 +350,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)

  integer::i

+  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -377,6 +379,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)

  integer::i

+  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -881,18 +884,10 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  real*8, dimension(-ord+1:extc(1),-ord+1:extc(2),-ord+1:extc(3)),intent(out):: funcc
  real*8, dimension(1:3), intent(in) :: SoA

-  integer::i,j,k
-
-  !$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
-  do k=1,extc(3)
-  do j=1,extc(2)
-  do i=1,extc(1)
-     funcc(i,j,k) = func(i,j,k)
-  enddo
-  enddo
-  enddo
-  !$OMP END PARALLEL DO
+  integer::i

+  funcc = 0.d0
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
   enddo
@@ -917,6 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)

  integer::i

+  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -945,6 +941,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)

  integer::i

+  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -1121,65 +1118,64 @@ end subroutine d2dump
 ! Lagrangian polynomial interpolation
 !------------------------------------------------------------------------------

-  subroutine polint(xa, ya, x, y, dy, ordn)
+  subroutine polint(xa,ya,x,y,dy,ordn)
+
  implicit none

-  integer, intent(in) :: ordn
-  real*8, dimension(ordn), intent(in) :: xa, ya
+!~~~~~~> Input Parameter:
+  integer,intent(in) :: ordn
+  real*8, dimension(ordn), intent(in) :: xa,ya
  real*8, intent(in) :: x
-  real*8, intent(out) :: y, dy
+  real*8, intent(out) :: y,dy

-  integer :: i, m, ns, n_m
-  real*8, dimension(ordn) :: c, d, ho
-  real*8 :: dif, dift, hp, h, den_val
+!~~~~~~> Other parameter:

-  c = ya
-  d = ya
-  ho = xa - x
+  integer :: m,n,ns
+  real*8, dimension(ordn) :: c,d,den,ho
+  real*8 :: dif,dift

-  ns = 1
-  dif = abs(x - xa(1))
+!~~~~~~>

-  do i = 2, ordn
-    dift = abs(x - xa(i))
-    if (dift < dif) then
-      ns = i
-      dif = dift
+  n=ordn
+  m=ordn
+
+  c=ya
+  d=ya
+  ho=xa-x
+
+  ns=1
+  dif=abs(x-xa(1))
+  do m=1,n
+   dift=abs(x-xa(m))
+   if(dift < dif) then
+    ns=m
+    dif=dift
   end if
  end do

-  y = ya(ns)
-  ns = ns - 1
-
-  do m = 1, ordn - 1
-    n_m = ordn - m
-    do i = 1, n_m
-      hp = ho(i)
-      h  = ho(i+m)
-      den_val = hp - h
-
-      if (den_val == 0.0d0) then
+  y=ya(ns)
+  ns=ns-1
+  do m=1,n-1
+    den(1:n-m)=ho(1:n-m)-ho(1+m:n)
+    if (any(den(1:n-m) == 0.0))then
      write(*,*) 'failure in polint for point',x
      write(*,*) 'with input points: ',xa
      stop
-      end if
-
-      den_val = (c(i+1) - d(i)) / den_val
-
-      d(i) = h * den_val
-      c(i) = hp * den_val
-    end do
-
-    if (2 * ns < n_m) then
-      dy = c(ns + 1)
+    endif
+    den(1:n-m)=(c(2:n-m+1)-d(1:n-m))/den(1:n-m)
+    d(1:n-m)=ho(1+m:n)*den(1:n-m)
+    c(1:n-m)=ho(1:n-m)*den(1:n-m)
+    if (2*ns < n-m) then
+      dy=c(ns+1)
    else
-      dy = d(ns)
-      ns = ns - 1
+      dy=d(ns)
+      ns=ns-1
    end if
-    y = y + dy
+    y=y+dy
  end do

  return
+
  end subroutine polint
 !------------------------------------------------------------------------------
 !
@@ -1187,37 +1183,35 @@ end subroutine d2dump
 !
 !------------------------------------------------------------------------------
  subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
+
  implicit none

+!~~~~~~> Input parameters:
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a
  real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2
  real*8, intent(out) :: y,dy

-#ifdef POLINT_LEGACY_ORDER
+!~~~~~~> Other parameters:
+
  integer  :: i,m
  real*8, dimension(ordn) :: ymtmp
  real*8, dimension(ordn) :: yntmp

  m=size(x1a)
+  
  do i=1,m
+
    yntmp=ya(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
-  end do
-  call polint(x1a,ymtmp,x1,y,dy,ordn)
-#else
-  integer  :: j
-  real*8, dimension(ordn) :: ymtmp
-  real*8 :: dy_temp

-  do j=1,ordn
-    call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
  end do
-  call polint(x2a, ymtmp, x2, y, dy, ordn)
-#endif
+
+  call polint(x1a,ymtmp,x1,y,dy,ordn)

  return
+
  end subroutine polin2
 !------------------------------------------------------------------------------
 !
@@ -1225,15 +1219,18 @@ end subroutine d2dump
 !
 !------------------------------------------------------------------------------
  subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
+
  implicit none

+!~~~~~~> Input parameters:
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
  real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2,x3
  real*8, intent(out) :: y,dy

-#ifdef POLINT_LEGACY_ORDER
+!~~~~~~> Other parameters:
+
  integer  :: i,j,m,n
  real*8, dimension(ordn,ordn) :: yatmp
  real*8, dimension(ordn) :: ymtmp
@@ -1242,33 +1239,24 @@ end subroutine d2dump

  m=size(x1a)
  n=size(x2a)
+  
  do i=1,m
   do j=1,n
+
    yqtmp=ya(i,j,:)
    call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
+
   end do
+
    yntmp=yatmp(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
-  end do
-  call polint(x1a,ymtmp,x1,y,dy,ordn)
-#else
-  integer  :: j, k
-  real*8, dimension(ordn,ordn) :: yatmp
-  real*8, dimension(ordn) :: ymtmp
-  real*8 :: dy_temp

-  do k=1,ordn
-    do j=1,ordn
-      call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
  end do
-  end do
-  do k=1,ordn
-    call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
-  end do
-  call polint(x3a, ymtmp, x3, y, dy, ordn)
-#endif
+
+  call polint(x1a,ymtmp,x1,y,dy,ordn)

  return
+
  end subroutine polin3
 !--------------------------------------------------------------------------------------
 ! calculate L2norm
--- a/AMSS_NCKU_source/kodiss.f90
+++ b/AMSS_NCKU_source/kodiss.f90
@@ -159,42 +159,36 @@ integer, parameter :: NO_SYMM=0, OCTANT=2

  call symmetry_bd(3,ex,f,fh,SoA)

-! Interior: all stencil points guaranteed in-bounds
-  !$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
-  do k=4,ex(3)-3
-  do j=4,ex(2)-3
-  !DIR$ IVDEP
-  do i=4,ex(1)-3
-   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/cof *( (     &
-                              (fh(i-3,j,k)+fh(i+3,j,k)) - &
-                          SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
-                          FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
-                          TWT* fh(i,j,k)            )/dX + &
-                                                  (     &
-                              (fh(i,j-3,k)+fh(i,j+3,k)) - &
-                          SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
-                          FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
-                          TWT* fh(i,j,k)            )/dY + &
-                                                  (     &
-                              (fh(i,j,k-3)+fh(i,j,k+3)) - &
-                          SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
-                          FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
-                          TWT* fh(i,j,k)            )/dZ )
-  enddo
-  enddo
-  enddo
-  !$OMP END PARALLEL DO
-
-! Boundary shell: original branching logic for points near edges
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
-  if(i >= 4 .and. i <= ex(1)-3 .and. &
-     j >= 4 .and. j <= ex(2)-3 .and. &
-     k >= 4 .and. k <= ex(3)-3) cycle
+
  if(i-3 >= imin .and. i+3 <= imax .and. &
     j-3 >= jmin .and. j+3 <= jmax .and. &
     k-3 >= kmin .and. k+3 <= kmax) then
+#if 0     
+! x direction
+   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/dX/cof * (     &
+                              (fh(i-3,j,k)+fh(i+3,j,k)) - &
+                          SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
+                          FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
+                          TWT* fh(i,j,k)            )
+! y direction
+
+   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/dY/cof * (     &
+                              (fh(i,j-3,k)+fh(i,j+3,k)) - &
+                          SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
+                          FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
+                          TWT* fh(i,j,k)            )
+! z direction
+
+   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/dZ/cof * (     &
+                              (fh(i,j,k-3)+fh(i,j,k+3)) - &
+                          SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
+                          FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
+                          TWT* fh(i,j,k)            )
+#else
+! is the calculation order important?
   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/cof *( (     &
                              (fh(i-3,j,k)+fh(i+3,j,k)) - &
                          SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
@@ -210,7 +204,9 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
                          SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
                          FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
                          TWT* fh(i,j,k)            )/dZ )
+#endif
  endif
+
  enddo
  enddo
  enddo
--- a/AMSS_NCKU_source/lopsidediff.f90
+++ b/AMSS_NCKU_source/lopsidediff.f90
@@ -233,7 +233,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)

 ! upper bound set ex-1 only for efficiency, 
 ! the loop body will set ex 0 also
-  !$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -483,7 +482,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
  enddo
  enddo
  enddo
-  !$OMP END PARALLEL DO

  return

--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -34,7 +34,7 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o

 F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   prolongrestrict_cell.o prolongrestrict_vertex.o\
-	   rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
+	   rungekutta4_rout.o bssn_rhs_opt.o bssn_rhs.o bssn_rhs_legacy.o diff_new.o kodiss.o kodiss_sh.o\
 	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
 	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -7,8 +7,9 @@
 filein  = -I/usr/include/ -I${MKLROOT}/include

 ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
-## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lifcore -limf -lpthread -lm -ldl -qopenmp
+LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \
+          -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
+          -lpthread -lm -ldl

 ## Aggressive optimization flags:
 ## -O3: Maximum optimization
@@ -16,10 +17,10 @@ LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lifco
 ## -fp-model fast=2: Aggressive floating-point optimizations
 ## -fma: Enable fused multiply-add instructions
 ## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
-CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \
+CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma \
               -Dfortran3 -Dnewc -I${MKLROOT}/include
-f90appflags  = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \
-               -align array64byte -fpp -I${MKLROOT}/include
+f90appflags  = -O3 -xHost -fp-model fast=2 -fma \
+               -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx
 CXX          = icpx
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -15,7 +15,7 @@ import subprocess
 ## taskset ensures all child processes inherit the CPU affinity mask
 ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
 ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
-NUMACTL_CPU_BIND = "taskset -c 0-111"
+NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"

 ## Build parallelism configuration
 ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
Author	SHA1	Message	Date
CGH0S7	ed89bc029b	Fix potential division by zero in reta_val calculation and enable NaN checks Added a safety check for the denominator in the reta_val calculation to prevent division by zero when chi approaches zero (e.g., at far-field boundaries). Also enabled DEBUG_NAN_CHECK macro to catch invalid inputs early. Initialized output arrays to zero to prevent uninitialized memory access.	2026-01-19 20:29:48 +08:00
CGH0S7	19274e93d1	Fix boundary handling in bssn_rhs_opt.f90 to prevent NaNs Refactored calc_derivs and calc_dderivs to include correct boundary handling logic matching the legacy code. Implemented fallback to 2nd order derivatives when near boundaries where 4th order stencils cannot be used. Added logic to initialize output arrays to zero to avoid uninitialized memory access.	2026-01-19 20:03:22 +08:00
CGH0S7	ae1a474cca	Fix compilation errors and complete logic in BSSN RHS optimization	2026-01-19 19:22:52 +08:00
CGH0S7	cbb8fb3a87	patched last commit	2026-01-19 17:14:28 +08:00
CGH0S7	4472d89a9f	Optimize bssn_rhs calculation with cache blocking and vectorization - Implemented cache blocking (BLK=8) in bssn_rhs_opt.f90 to improve L1/L2 cache hit rate. - Introduced bssn_rhs_opt.f90 module with vectorized derivative and physics kernels. - Renamed original implementation to bssn_rhs_legacy.f90 for fallback. - Updated bssn_rhs.f90 to act as a dispatcher, using the optimized path for ghost_width=3. - Updated makefile to include new source files. - Added DEBUG_NAN_CHECK macro to optionally disable NaN checks in production.	2026-01-19 16:39:24 +08:00