Stabilize EScalar CUDA fallback path

Stabilize EScalar CUDA sync defaults
Pin EScalar scalar CUDA transfers
2026-05-03 16:05:47 +08:00 · 2026-05-03 00:24:50 +08:00 · 2026-05-02 19:21:57 +08:00 · 2026-05-02 18:38:43 +08:00 · 2026-05-02 18:27:26 +08:00 · 2026-05-02 18:19:15 +08:00
40 changed files with 41895 additions and 29733 deletions
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,9 +16,9 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 64                             ## number of mpi processes used in the simulation
+MPI_processes    = 2                             ## number of mpi processes used in the simulation
-GPU_Calculation  = "no"                          ## Use GPU or not 
+GPU_Calculation  = "yes"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
 CPU_Part         = 1.0
 GPU_Part         = 0.0
@@ -31,7 +31,7 @@ GPU_Part         = 0.0
 ## Setting the physical system and numerical method
 Symmetry                 = "equatorial-symmetry"   ## Symmetry of System: choose equatorial-symmetry、no-symmetry、octant-symmetry
-Equation_Class           = "BSSN"                  ## Evolution Equation: choose "BSSN", "BSSN-EScalar", "BSSN-EM", "Z4C" 
+Equation_Class           = "BSSN-EScalar"                  ## Evolution Equation: choose "BSSN", "BSSN-EScalar", "BSSN-EM", "Z4C"
                                                   ## If "BSSN-EScalar" is chosen, it is necessary to set other parameters below
 Initial_Data_Method      = "Ansorg-TwoPuncture"    ## initial data method: choose "Ansorg-TwoPuncture", "Lousto-Analytical", "Cao-Analytical", "KerrSchild-Analytical"
 Time_Evolution_Method    = "runge-kutta-45"        ## time evolution method: choose "runge-kutta-45"
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -58,31 +58,36 @@ File_directory = os.path.join(input_data.File_directory)
 ## If the specified output directory exists, ask the user whether to continue
 if os.path.exists(File_directory):
-    print( " Output dictionary has been existed !!!  "                                                              )
+    auto_overwrite = str(getattr(input_data, "Auto_Overwrite_Output", "yes")).strip().lower()
-    print( " If you want to overwrite the existing file directory, please input 'continue' in the terminal !! "     ) 
+    if auto_overwrite in ("1", "yes", "y", "true", "on", "continue"):
-    print( " If you want to retain the existing file directory, please input 'stop' in the terminal to stop the "   ) 
+        print( " Output dictionary has been existed; Auto_Overwrite_Output=yes, continue the calculation. " )
-    print( " simulation. Then you can reset the output dictionary in the input script file AMSS_NCKU_Input.py !!! " )
+        print(                                                                                              )
-    print(                                                                                                          )
+    else:
-    ## Prompt whether to overwrite the existing directory
+        print( " Output dictionary has been existed !!!  "                                                              )
-    while True:
+        print( " If you want to overwrite the existing file directory, please input 'continue' in the terminal !! "     ) 
-        try:
+        print( " If you want to retain the existing file directory, please input 'stop' in the terminal to stop the "   ) 
-            inputvalue = input()
+        print( " simulation. Then you can reset the output dictionary in the input script file AMSS_NCKU_Input.py !!! " )
-            ## If the user agrees to overwrite, proceed and remove the existing directory
+        print(                                                                                                          )
-            if ( inputvalue == "continue" ):
+        ## Prompt whether to overwrite the existing directory
-                print( " Continue the calculation !!! " )
+        while True:
-                print(                                  )
+            try:
-                break  
+                inputvalue = input()
-            ## If the user chooses not to overwrite, exit and keep the existing directory
+                ## If the user agrees to overwrite, proceed and remove the existing directory
-            elif ( inputvalue == "stop" ):
+                if ( inputvalue == "continue" ):
-                print( " Stop the calculation !!! "    )
+                    print( " Continue the calculation !!! " )
-                sys.exit() 
+                    print(                                  )
-            ## If the user input is invalid, prompt again
+                    break  
-            else:
+                ## If the user chooses not to overwrite, exit and keep the existing directory
                elif ( inputvalue == "stop" ):
                    print( " Stop the calculation !!! "    )
                    sys.exit() 
                ## If the user input is invalid, prompt again
                else:
                    print( " Please input your choice !!! "                   )
                    print( " Input 'continue' or 'stop' in the terminal !!! " )
            except ValueError:
                print( " Please input your choice !!! "                   )
                print( " Input 'continue' or 'stop' in the terminal !!! " )
        except ValueError:
            print( " Please input your choice !!! "                   )
            print( " Input 'continue' or 'stop' in the terminal !!! " )
 ## Remove the existing output directory if present
 shutil.rmtree(File_directory, ignore_errors=True)
@@ -174,14 +179,11 @@ import generate_macrodef
 generate_macrodef.generate_macrodef_h()
 print( " AMSS-NCKU macro file macrodef.h has been generated. " )
-generate_macrodef.generate_macrodef_fh()
+generate_macrodef.generate_macrodef_fh()
-print( " AMSS-NCKU macro file macrodef.fh has been generated. " )
+print( " AMSS-NCKU macro file macrodef.fh has been generated. " )
-
+
-generate_macrodef.generate_build_config()
+
-print( " AMSS-NCKU build config AMSS_NCKU_build.mk has been generated. " )
+##################################################################
 ##################################################################
 # Compile the AMSS-NCKU program according to user requirements
@@ -220,13 +222,11 @@ shutil.copytree(AMSS_NCKU_source_path, AMSS_NCKU_source_copy)
 # Copy the generated macro files into the AMSS_NCKU source folder
-macrodef_h_path  = os.path.join(File_directory, "macrodef.h")
+macrodef_h_path  = os.path.join(File_directory, "macrodef.h") 
-macrodef_fh_path = os.path.join(File_directory, "macrodef.fh")
+macrodef_fh_path = os.path.join(File_directory, "macrodef.fh") 
-build_config_path = os.path.join(File_directory, "AMSS_NCKU_build.mk")
+
-
+shutil.copy2(macrodef_h_path,  AMSS_NCKU_source_copy)
-shutil.copy2(macrodef_h_path,  AMSS_NCKU_source_copy)
+shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
 shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
 shutil.copy2(build_config_path, AMSS_NCKU_source_copy)
 # Notes on copying files:
 # shutil.copy2 preserves file metadata such as modification time.
@@ -263,7 +263,7 @@ print()
 if (input_data.GPU_Calculation == "no"):
    ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE")
 elif (input_data.GPU_Calculation == "yes"):
-    ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABEGPU")
+    ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE_CUDA")
 if not os.path.exists( ABE_file ):
    print(                                                                                                  )
--- a/AMSS_NCKU_source/Block.C
+++ b/AMSS_NCKU_source/Block.C
@@ -6,14 +6,68 @@
 #include <cstdio>
 #include <string>
 #include <cmath>
-#include <new>
+#include <new>
-using namespace std;
+using namespace std;
-
+
-#include "Block.h"
+#include "Block.h"
-#include "misc.h"
+#include "misc.h"
-
+
-Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
-{
+#include <cuda_runtime_api.h>
 #endif
 namespace {
 // Tell whether grid-function storage should be requested as pinned host
 // memory.  Opt-in: the AMSS_CUDA_PIN_GRIDFUNCS environment variable must
 // parse (via atoi) to a non-zero integer.  The environment is consulted
 // only on the first call; the answer is cached afterwards.
 bool cuda_pin_gridfuncs_enabled()
 {
  // -1: environment not read yet, 0: disabled, 1: enabled
  static int cached_flag = -1;
  if (cached_flag >= 0)
    return cached_flag == 1;

  const char *raw = getenv("AMSS_CUDA_PIN_GRIDFUNCS");
  const bool requested = (raw != 0) && (atoi(raw) != 0);
  cached_flag = requested ? 1 : 0;
  return requested;
 }
 // Allocate storage for `count` doubles of one grid function.
 //
 // In CUDA builds with pinning enabled, cudaMallocHost is tried first; on
 // success `pinned` is set to 1.  In every other case (non-CUDA build,
 // pinning disabled, or cudaMallocHost failure) the memory comes from
 // malloc and `pinned` stays 0.  The buffer must be released with the
 // deallocator that matches `pinned`.  Returns a null pointer only when the
 // final malloc fails.  The memory is not initialised here.
 double *alloc_gridfunc(size_t count, unsigned char &pinned)
 {
  const size_t nbytes = count * sizeof(double);
  pinned = 0;
 #if USE_CUDA_BSSN || USE_CUDA_Z4C
  if (cuda_pin_gridfuncs_enabled())
  {
    void *host_ptr = 0;
    if (cudaMallocHost(&host_ptr, nbytes) == cudaSuccess)
    {
      pinned = 1;
      return static_cast<double *>(host_ptr);
    }
    // Presumably resets the runtime's recorded error so that the failed
    // pinned allocation is not reported by a later CUDA call.
    cudaGetLastError();
  }
 #endif
  return static_cast<double *>(malloc(nbytes));
 }
 // Release a buffer with the deallocator that matches how it was obtained:
 // cudaFreeHost when `pinned` is non-zero (CUDA builds only), free()
 // otherwise.  A null pointer is ignored.
 void free_gridfunc(double *ptr, unsigned char pinned)
 {
 #if USE_CUDA_BSSN || USE_CUDA_Z4C
  if (ptr != 0)
  {
    if (pinned != 0)
      cudaFreeHost(ptr);
    else
      free(ptr);
  }
 #else
  // without CUDA every buffer came from malloc, so the flag is irrelevant
  (void)pinned;
  if (ptr != 0)
    free(ptr);
 #endif
 }
 }
 Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), lev(levi), cgpu(cgpui), ingfs(ingfsi), fngfs(fngfsi), igfs(0), fgfs(0), fgfs_pinned(0)
 {
  for (int i = 0; i < dim; i++)
    X[i] = 0;
@@ -68,14 +122,15 @@ Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fng
 #endif
    }
-    int nn = shape[0] * shape[1] * shape[2];
+    int nn = shape[0] * shape[1] * shape[2];
-    fgfs = new double *[fngfs];
+    fgfs = new double *[fngfs];
-    for (int i = 0; i < fngfs; i++)
+    fgfs_pinned = new unsigned char[fngfs];
-    {
+    for (int i = 0; i < fngfs; i++)
-      fgfs[i] = (double *)malloc(sizeof(double) * nn);
+    {
-      if (!(fgfs[i]))
+      fgfs[i] = alloc_gridfunc((size_t)nn, fgfs_pinned[i]);
-      {
+      if (!(fgfs[i]))
-        cout << "on node#" << rank << ", out of memory when constructing Block." << endl;
+      {
        cout << "on node#" << rank << ", out of memory when constructing Block." << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      memset(fgfs[i], 0, sizeof(double) * nn);
@@ -103,17 +158,19 @@ Block::~Block()
  {
    for (int i = 0; i < dim; i++)
      delete[] X[i];
-    for (int i = 0; i < ingfs; i++)
+    for (int i = 0; i < ingfs; i++)
-      free(igfs[i]);
+      free(igfs[i]);
-    delete[] igfs;
+    delete[] igfs;
-    for (int i = 0; i < fngfs; i++)
+    for (int i = 0; i < fngfs; i++)
-      free(fgfs[i]);
+      free_gridfunc(fgfs[i], fgfs_pinned ? fgfs_pinned[i] : 0);
-    delete[] fgfs;
+    delete[] fgfs;
-    X[0] = X[1] = X[2] = 0;
+    delete[] fgfs_pinned;
-    igfs = 0;
+    X[0] = X[1] = X[2] = 0;
-    fgfs = 0;
+    igfs = 0;
-  }
+    fgfs = 0;
-}
+    fgfs_pinned = 0;
  }
 }
 void Block::checkBlock()
 {
  int myrank;
@@ -184,12 +241,14 @@ void Block::swapList(MyList<var> *VarList1, MyList<var> *VarList2, int myrank)
  if (rank == myrank)
  {
    MyList<var> *varl1 = VarList1, *varl2 = VarList2;
-    while (varl1 && varl2)
+    while (varl1 && varl2)
-    {
+    {
-      misc::swap<double *>(fgfs[varl1->data->sgfn], fgfs[varl2->data->sgfn]);
+      misc::swap<double *>(fgfs[varl1->data->sgfn], fgfs[varl2->data->sgfn]);
-      varl1 = varl1->next;
+      if (fgfs_pinned)
-      varl2 = varl2->next;
+        misc::swap<unsigned char>(fgfs_pinned[varl1->data->sgfn], fgfs_pinned[varl2->data->sgfn]);
-    }
+      varl1 = varl1->next;
      varl2 = varl2->next;
    }
    if (varl1 || varl2)
    {
      cout << "error in Block::swaplist, var lists does not match." << endl;
--- a/AMSS_NCKU_source/Block.h
+++ b/AMSS_NCKU_source/Block.h
@@ -13,14 +13,15 @@ public:
   int shape[dim];
   double bbox[2 * dim];
   double *X[dim];
-   int rank; // where the real data locate in
+   int rank; // where the real data locate in
-   int lev, cgpu;
+   int lev, cgpu;
-   int ingfs, fngfs;
+   int ingfs, fngfs;
-   int *(*igfs);
+   int *(*igfs);
-   double *(*fgfs);
+   double *(*fgfs);
   unsigned char *fgfs_pinned;
 public:
-   Block() {};
+   Block() : rank(0), lev(0), cgpu(0), ingfs(0), fngfs(0), igfs(0), fgfs(0), fgfs_pinned(0) {};
   Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfs, int levi, const int cgpui = 0);
   ~Block();
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -11,12 +11,15 @@
 using namespace std;
 #include "misc.h"
-#include "MPatch.h"
+#include "MPatch.h"
-#include "Parallel.h"
+#include "Parallel.h"
-#include "fmisc.h"
+#include "fmisc.h"
-#ifdef INTERP_LB_PROFILE
+#if USE_CUDA_BSSN
-#include "interp_lb_profile.h"
+#include "bssn_rhs_cuda.h"
-#endif
+#endif
 #ifdef INTERP_LB_PROFILE
 #include "interp_lb_profile.h"
 #endif
 namespace
 {
@@ -154,8 +157,8 @@ void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
  index.valid = true;
 }
-int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
+int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
-{
+{
  if (!index.valid)
    return -1;
@@ -175,10 +178,448 @@ int find_block_index_for_point(const BlockBinIndex &index, const double *pox, co
  for (size_t bi = 0; bi < index.views.size(); bi++)
    if (point_in_block_view(index.views[bi], pox, DH))
      return int(bi);
-
+
-  return -1;
+  return -1;
-}
+}
-} // namespace
+
 // Convert a double to int by truncation toward zero (plain C++ conversion).
 inline int fortran_idint_local(double x)
 {
  const int truncated = static_cast<int>(x);
  return truncated;
 }
 // Switch for the stencil-based fast interpolation path.  On by default;
 // it is turned off only when AMSS_INTERP_FAST is set and parses (atoi) to
 // zero.  The environment is read once and the result cached.
 bool interp_fast_enabled()
 {
  static int state = -1; // stays -1 until the environment has been consulted
  if (state < 0)
  {
    const char *text = getenv("AMSS_INTERP_FAST");
    const bool switched_off = (text != 0) && (atoi(text) == 0);
    state = switched_off ? 0 : 1;
  }
  return state == 1;
 }
 // Switch for the GPU interpolation path.  Off by default; enabled when
 // AMSS_INTERP_GPU is set and parses (atoi) to a non-zero integer.  The
 // environment is read once and the result cached.
 bool interp_gpu_enabled()
 {
  static int state = -1; // -1: not yet determined
  if (state >= 0)
    return state == 1;

  const char *text = getenv("AMSS_INTERP_GPU");
  if (text != 0 && atoi(text) != 0)
  {
    state = 1;
    return true;
  }
  state = 0;
  return false;
 }
 // Switch for the self-check that compares the fast interpolation result
 // with f_global_interp.  Off by default; enabled when
 // AMSS_INTERP_FAST_COMPARE is set and parses (atoi) to a non-zero integer.
 // The environment is read once and the result cached.
 bool interp_fast_compare_enabled()
 {
  static int state = -1; // -1: not yet determined
  if (state < 0)
  {
    const char *text = getenv("AMSS_INTERP_FAST_COMPARE");
    int parsed = 0;
    if (text != 0)
      parsed = atoi(text);
    state = (parsed != 0) ? 1 : 0;
  }
  return state == 1;
 }
 // Tolerance used by the fast-vs-reference interpolation comparison.
 // Taken from AMSS_INTERP_FAST_COMPARE_TOL when that parses (atof) to a
 // positive number, otherwise 1.0e-11.  Determined once and cached.
 double interp_fast_compare_tol()
 {
  static double cached_tol = -1.0; // negative means "not yet determined"
  if (cached_tol < 0.0)
  {
    double parsed = 0.0;
    if (const char *text = getenv("AMSS_INTERP_FAST_COMPARE_TOL"))
      parsed = atof(text);
    cached_tol = (parsed > 0.0) ? parsed : 1.0e-11;
  }
  return cached_tol;
 }
 // Maximum number of comparison calls performed by the fast-vs-reference
 // self-check.  Taken from AMSS_INTERP_FAST_COMPARE_LIMIT when that parses
 // (atoll) to a positive integer, otherwise 4096.  Determined once and
 // cached.
 long long interp_fast_compare_limit()
 {
  static long long cached_limit = -1; // negative means "not yet determined"
  if (cached_limit < 0)
  {
    long long parsed = 0;
    if (const char *text = getenv("AMSS_INTERP_FAST_COMPARE_LIMIT"))
      parsed = atoll(text);
    cached_limit = (parsed > 0) ? parsed : 4096;
  }
  return cached_limit;
 }
 // Precomputed interpolation stencil for one target point inside one Block.
 // Filled by prepare_fast_interp_stencil() and consumed by
 // interpolate_var_list_with_stencil().  The fixed array sizes match the
 // maximum order accepted there (ordn <= 8, hence 8 weights per direction
 // and at most 8*8*8 = 512 samples).
 struct FastInterpStencil
 {
  // 1-based index of the first stencil node in each direction; may be <= 0
  // when the stencil reaches across a symmetry plane.
  int cxB[dim];
  // Target position measured from the first stencil node, in units of the
  // grid spacing of that direction.
  double cx[dim];
  // One-dimensional interpolation weights in x, y and z (first ordn used).
  double wx[8];
  double wy[8];
  double wz[8];
  // Number of valid entries in loc / sign_mask / weight.
  int nsamples;
  // Flattened 0-based offset of each sample inside a grid-function array
  // (x fastest, then y, then z).
  int loc[512];
  // Bit d is set when the sample index was reflected in direction d.
  unsigned char sign_mask[512];
  // Product of the three one-dimensional weights for each sample.
  double weight[512];
 };
 // Lagrange basis weights for interpolation at position x on the unit-spaced
 // nodes 0, 1, ..., ordn-1.  On return w[i] is the i-th basis polynomial
 // evaluated at x:  prod_{j != i} (x - j) / (i - j).
 inline void lagrange_unit_weights(double x, int ordn, double *w)
 {
  for (int node = 0; node < ordn; node++)
  {
    double numer = 1.0;
    double denom = 1.0;
    // factors from nodes below `node` ...
    for (int other = 0; other < node; other++)
    {
      numer *= (x - double(other));
      denom *= double(node - other);
    }
    // ... followed by the nodes above it (the node itself is skipped)
    for (int other = node + 1; other < ordn; other++)
    {
      numer *= (x - double(other));
      denom *= double(node - other);
    }
    w[node] = numer / denom;
  }
 }
 // Interpolation weights in the z direction on the unit-spaced nodes
 // 0 .. ordn-1.  For ordn == 6 the barycentric form is used (node weights
 // (-1)^(5-i) * C(5,i), normalised by their sum); any other order is
 // delegated to lagrange_unit_weights().
 inline void z_unit_weights(double x, int ordn, double *w)
 {
  if (ordn != 6)
  {
    lagrange_unit_weights(x, ordn, w);
    return;
  }

  static const double bary[6] = {-1.0, 5.0, -10.0, 10.0, -5.0, 1.0};

  // x sitting exactly on a node would divide by zero below, so that case
  // yields the unit vector for that node directly.
  for (int node = 0; node < 6; node++)
  {
    if (x != double(node))
      continue;
    for (int j = 0; j < 6; j++)
      w[j] = (j == node) ? 1.0 : 0.0;
    return;
  }

  double total = 0.0;
  for (int i = 0; i < 6; i++)
  {
    w[i] = bary[i] / (x - double(i));
    total += w[i];
  }
  for (int i = 0; i < 6; i++)
    w[i] /= total;
 }
 // Map a 1-based stencil index in direction d onto a stored grid index.
 //
 //   idx    - 1-based stencil index; values <= 0 lie beyond the lower face
 //   extent - number of points the block has in this direction
 //   d      - direction (0, 1 or 2), used as the bit position in `mask`
 //   mapped - receives the 1-based index to read from
 //   mask   - bit d is OR-ed in when the index had to be reflected
 //
 // Positive indices are used unchanged.  Non-positive indices are mirrored:
 // 2 - idx when Vertex is defined, 1 - idx when Cell is defined (exactly one
 // of the two macros must be defined).  Returns true when the mapped index
 // lies within [1, extent].
 inline bool fast_interp_map_index(int idx, int extent, int d,
                                   int &mapped, unsigned char &mask)
 {
  if (idx > 0)
    mapped = idx;
  else
  {
    // remember that this sample was obtained by reflection in direction d
    mask |= (unsigned char)(1u << d);
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
    mapped = 2 - idx;
 #else
 #ifdef Cell
    mapped = 1 - idx;
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
  }
  return mapped >= 1 && mapped <= extent;
 }
 // Build the interpolation stencil for point `pox` inside block `BP`.
 //
 //   BP       - block that contains the point (must be non-null)
 //   pox      - coordinates of the target point, one per direction
 //   ordn     - number of stencil nodes per direction (1..8)
 //   Symmetry - symmetry type; 0 means no symmetry and 2 means octant here
 //   st       - receives stencil origin, 1D weights and the sample table
 //
 // Returns false when BP is null, ordn is out of range, or any stencil node
 // maps outside the block; in the last case st may be left partially filled
 // (st.nsamples counts only the samples stored before the failure).
 // NOTE(review): the grid spacing is taken from X[1] - X[0], so each
 // direction is assumed to have at least two points and uniform spacing -
 // confirm against the callers.
 bool prepare_fast_interp_stencil(Block *BP, const double *pox, int ordn,
                                  int Symmetry, FastInterpStencil &st)
 {
  if (!BP || ordn <= 0 || ordn > 8)
    return false;
  st.nsamples = 0;
  const int NO_SYMM = 0;
  const int OCTANT = 2;
  // cmin/cmax: allowed 1-based index range of the stencil per direction,
  // cxT: index of the last stencil node
  int cmin[dim], cmax[dim], cxT[dim];
  for (int d = 0; d < dim; d++)
  {
    const double *X = BP->X[d];
    const double dX = X[1] - X[0];
    // 1-based index of the grid point associated with pox[d]
    const int cxI = fortran_idint_local((pox[d] - X[0]) / dX + 0.4) + 1;
    st.cxB[d] = cxI - ordn / 2 + 1;
    cxT[d] = st.cxB[d] + ordn - 1;
    cmin[d] = 1;
    cmax[d] = BP->shape[d];
    // When the block's first coordinate lies within one spacing of zero and
    // the symmetry applies to this direction, the stencil is allowed to
    // extend to non-positive indices (resolved later by reflection).
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
    if (Symmetry == OCTANT && d < 2 && fabs(X[0]) < dX)
      cmin[d] = -ordn / 2 + 2;
    if (Symmetry != NO_SYMM && d == 2 && fabs(X[0]) < dX)
      cmin[d] = -ordn / 2 + 2;
 #else
 #ifdef Cell
    if (Symmetry == OCTANT && d < 2 && fabs(X[0]) < dX)
      cmin[d] = -ordn / 2 + 1;
    if (Symmetry != NO_SYMM && d == 2 && fabs(X[0]) < dX)
      cmin[d] = -ordn / 2 + 1;
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
    // shift the stencil so that it stays inside [cmin, cmax]
    if (st.cxB[d] < cmin[d])
    {
      st.cxB[d] = cmin[d];
      cxT[d] = st.cxB[d] + ordn - 1;
    }
    if (cxT[d] > cmax[d])
    {
      cxT[d] = cmax[d];
      st.cxB[d] = cxT[d] + 1 - ordn;
    }
    // position of the point relative to the first stencil node, in units of
    // dX; for a non-positive first index the node coordinate is the mirror
    // image (negated) of a stored coordinate
    if (st.cxB[d] > 0)
      st.cx[d] = (pox[d] - X[st.cxB[d] - 1]) / dX;
    else
    {
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
      st.cx[d] = (pox[d] + X[1 - st.cxB[d]]) / dX;
 #else
 #ifdef Cell
      st.cx[d] = (pox[d] + X[-st.cxB[d]]) / dX;
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
    }
  }
  // one-dimensional weights; z uses its own routine
  lagrange_unit_weights(st.cx[0], ordn, st.wx);
  lagrange_unit_weights(st.cx[1], ordn, st.wy);
  z_unit_weights(st.cx[2], ordn, st.wz);
  // tabulate every stencil node: flattened offset, reflection mask and the
  // tensor-product weight (x index varies fastest)
  for (int kk = 0; kk < ordn; kk++)
  {
    for (int jj = 0; jj < ordn; jj++)
    {
      for (int ii = 0; ii < ordn; ii++)
      {
        unsigned char mask = 0;
        int ix, iy, iz;
        if (!fast_interp_map_index(st.cxB[0] + ii, BP->shape[0], 0, ix, mask) ||
            !fast_interp_map_index(st.cxB[1] + jj, BP->shape[1], 1, iy, mask) ||
            !fast_interp_map_index(st.cxB[2] + kk, BP->shape[2], 2, iz, mask))
          return false;
        const int s = st.nsamples++;
        st.loc[s] = (ix - 1) + (iy - 1) * BP->shape[0] +
                    (iz - 1) * BP->shape[0] * BP->shape[1];
        st.sign_mask[s] = mask;
        st.weight[s] = st.wx[ii] * st.wy[jj] * st.wz[kk];
      }
    }
  }
  return true;
 }
 // Interpolate every variable of `VarList` at one point using a prepared
 // stencil.
 //
 //   BP       - block holding the data (BP->fgfs is indexed by var::sgfn)
 //   VarList  - list of variables; must contain exactly num_var entries
 //   num_var  - number of variables (1..128)
 //   pox      - coordinates of the point (used only by the comparison mode)
 //   ordn, Symmetry - passed to f_global_interp in the comparison mode
 //   st       - stencil built for this point and block
 //   out      - receives num_var interpolated values
 //
 // Returns false when num_var is out of range or does not match the list
 // length; entries of `out` visited before the mismatch was detected have
 // already been zeroed by then.  Returns true otherwise.
 //
 // When interp_fast_compare_enabled() is true, the first
 // interp_fast_compare_limit() calls are additionally checked against
 // f_global_interp, and up to 32 mismatches beyond the tolerance are
 // reported on stderr.
 bool interpolate_var_list_with_stencil(Block *BP, MyList<var> *VarList,
                                        int num_var, const double *pox,
                                        int ordn, int Symmetry,
                                        const FastInterpStencil &st,
                                        double *out)
 {
  if (num_var <= 0 || num_var > 128)
    return false;
  // flatten the linked list into arrays so the sample loop needs no list walk
  double *data_ptrs[128];
  double *soa_ptrs[128];
  var *vars[128];
  MyList<var> *varl = VarList;
  int k = 0;
  while (varl)
  {
    if (k >= num_var)
      return false;
    vars[k] = varl->data;
    data_ptrs[k] = BP->fgfs[vars[k]->sgfn];
    soa_ptrs[k] = vars[k]->SoA;
    out[k] = 0.0;
    varl = varl->next;
    k++;
  }
  if (k != num_var)
    return false;
  // accumulate weight * value over all stencil samples
  for (int s = 0; s < st.nsamples; s++)
  {
    const int loc = st.loc[s];
    const double w = st.weight[s];
    const unsigned char mask = st.sign_mask[s];
    if (mask == 0)
    {
      // sample was not reflected: no sign factor needed
      for (int v = 0; v < num_var; v++)
        out[v] += w * data_ptrs[v][loc];
    }
    else
    {
      for (int v = 0; v < num_var; v++)
      {
        // reflected sample: multiply by the variable's SoA factor of every
        // direction whose bit is set in the mask
        const double *SoA = soa_ptrs[v];
        double sgn = 1.0;
        if (mask & 1u)
          sgn *= SoA[0];
        if (mask & 2u)
          sgn *= SoA[1];
        if (mask & 4u)
          sgn *= SoA[2];
        out[v] += w * sgn * data_ptrs[v][loc];
      }
    }
  }
  if (interp_fast_compare_enabled())
  {
    // counters are shared by all calls in this process
    static int report_count = 0;
    static long long compare_calls = 0;
    if (compare_calls++ >= interp_fast_compare_limit())
      return true;
    const double tol = interp_fast_compare_tol();
    varl = VarList;
    k = 0;
    while (varl)
    {
      var *vp = vars[k];
      double ref = 0.0;
      // local copies, passed to f_global_interp in place of pox[]
      double x = pox[0], y = pox[1], z = pox[2];
      f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2],
                      BP->fgfs[vp->sgfn], ref,
                      x, y, z, ordn, vp->SoA, Symmetry);
      const double diff = fabs(ref - out[k]);
      // mixed absolute/relative measure: tolerance scales with 1 + |ref|
      const double scale = 1.0 + fabs(ref);
      if (diff > tol * scale && report_count < 32)
      {
        int rank = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        fprintf(stderr,
                "[AMSS-INTERP-CMP][rank %d] var=%s diff=%.17e ref=%.17e fast=%.17e p=(%.17e,%.17e,%.17e)\n",
                rank, vp->name, diff, ref, out[k], pox[0], pox[1], pox[2]);
        report_count++;
      }
      varl = varl->next;
      k++;
    }
  }
  return true;
 }
 // One-shot fast interpolation: build a stencil for `pox` in block `BP` and
 // apply it to every variable in `VarList`, writing num_var values to `out`.
 // Returns false - so that the caller can fall back to the reference
 // routine - when the fast path is disabled, the stencil cannot be built,
 // or the variable list is rejected.
 bool interpolate_var_list_fast(Block *BP, MyList<var> *VarList, int num_var,
                                const double *pox, int ordn, int Symmetry,
                                double *out)
 {
  FastInterpStencil stencil;
  // short-circuit: the stencil is only prepared when the fast path is on
  const bool stencil_ready =
      interp_fast_enabled() &&
      prepare_fast_interp_stencil(BP, pox, ordn, Symmetry, stencil);
  if (!stencil_ready)
    return false;
  return interpolate_var_list_with_stencil(BP, VarList, num_var, pox,
                                           ordn, Symmetry, stencil, out);
 }
 // Cached lookup result for one interpolation point of a surface cache.
 struct CachedInterpPoint
 {
  // Block containing the point, or null when no block was found for it.
  Block *bp;
  // MPI rank owning that block (copied from bp->rank), or -1 when no block
  // was found.
  int owner_rank;
  // Stencil for the point; build_surface_cache() prepares it only on the
  // rank that owns the block, elsewhere nsamples stays 0.
  FastInterpStencil stencil;
 };
 // Per-point block lookups and stencils cached for one set of interpolation
 // points on one Patch, so that repeated calls with the same points can skip
 // the search and the stencil construction.
 struct SurfaceInterpCache
 {
  // Patch the cache was built for (null for an unused cache).
  Patch *patch;
  // Number of points the cache was built for.
  int NN;
  // Symmetry value the cache was built with (-1 for an unused cache).
  int symmetry;
  // Coordinates of three probe points (first, middle, last), dim values
  // each, used to recognise the same point set again.
  double key[9];
  // One entry per interpolation point.
  vector<CachedInterpPoint> points;
  SurfaceInterpCache() : patch(0), NN(0), symmetry(-1) {}
 };
 // Decide whether `cache` was built for this request.  The patch, the point
 // count and the symmetry must agree, the cache must hold exactly NN points,
 // and the coordinates of three probe points (first, middle, last) must be
 // bit-for-bit equal to the stored key.  Only those three points are
 // compared, not the full coordinate arrays.
 bool surface_cache_key_matches(const SurfaceInterpCache &cache, Patch *patch,
                                int NN, double **XX, int Symmetry)
 {
  if (NN <= 0)
    return false;
  if (cache.patch != patch || cache.NN != NN || cache.symmetry != Symmetry)
    return false;
  if (int(cache.points.size()) != NN)
    return false;

  const int probes[3] = {0, NN / 2, NN - 1};
  for (int q = 0; q < 3; q++)
  {
    for (int d = 0; d < dim; d++)
    {
      // key is stored probe by probe, dim coordinates each
      if (cache.key[q * dim + d] != XX[d][probes[q]])
        return false;
    }
  }
  return true;
 }
 // Return the cache entry matching this request, or a freshly appended empty
 // entry when none matches.  At most 24 entries are kept; the oldest one is
 // dropped to make room.  The returned pointer refers to an element of a
 // function-local static vector, so it is only valid until the next call
 // (which may erase or reallocate).
 SurfaceInterpCache *find_surface_cache(Patch *patch, int NN, double **XX,
                                        int Symmetry)
 {
  static vector<SurfaceInterpCache> caches;
  const size_t max_entries = 24;

  for (vector<SurfaceInterpCache>::iterator it = caches.begin();
       it != caches.end(); ++it)
  {
    if (surface_cache_key_matches(*it, patch, NN, XX, Symmetry))
      return &*it;
  }

  if (caches.size() >= max_entries)
    caches.erase(caches.begin());
  caches.push_back(SurfaceInterpCache());
  return &caches.back();
 }
 // Fill `cache` for the NN points given by XX on `patch`.
 //
 //   cache       - entry to (re)build; any previous content is discarded
 //   patch       - patch the points belong to
 //   NN          - number of points
 //   XX          - XX[d][j] is coordinate d of point j
 //   Symmetry    - symmetry value, stored in the cache and used for stencils
 //   DH          - grid spacings, forwarded to the block lookup
 //   block_index - block lookup structure for `patch`
 //   ordn        - interpolation order, forwarded to the stencil builder
 //
 // For every point the containing block and its owner rank are recorded;
 // points for which no block is found get bp = 0 and owner_rank = -1.  The
 // stencil is prepared only for points whose block lives on this rank.
 //
 // Returns true when the cache is complete.  Returns false when NN <= 0 or
 // when a stencil cannot be built; in that case the cache is left in the
 // empty, non-matching state.  The identifying fields (patch, NN, symmetry)
 // are written only after the build succeeded, so a failed build can never
 // satisfy surface_cache_key_matches() on a later call and be used with
 // incomplete stencils.
 bool build_surface_cache(SurfaceInterpCache &cache, Patch *patch, int NN,
                          double **XX, int Symmetry, const double *DH,
                          const BlockBinIndex &block_index, int ordn)
 {
  // Start from the "unused" state; it is kept on every failure path.
  cache.patch = 0;
  cache.NN = 0;
  cache.symmetry = -1;
  cache.points.clear();

  // Without points there is nothing to cache, and the probe indices below
  // (in particular NN - 1) would fall outside XX.
  if (NN <= 0)
    return false;

  int myrank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  cache.points.resize(NN);

  // Key: coordinates of the first, middle and last point.
  const int mid = NN / 2;
  const int last = NN - 1;
  const int ids[3] = {0, mid, last};
  int p = 0;
  for (int q = 0; q < 3; q++)
    for (int d = 0; d < dim; d++)
      cache.key[p++] = XX[d][ids[q]];

  for (int j = 0; j < NN; j++)
  {
    double pox[dim];
    for (int d = 0; d < dim; d++)
      pox[d] = XX[d][j];

    const int block_i = find_block_index_for_point(block_index, pox, DH);
    if (block_i < 0)
    {
      // point not covered by any block in the index
      cache.points[j].bp = 0;
      cache.points[j].owner_rank = -1;
      continue;
    }

    Block *BP = block_index.views[block_i].bp;
    cache.points[j].bp = BP;
    cache.points[j].owner_rank = BP->rank;
    cache.points[j].stencil.nsamples = 0;
    if (BP->rank == myrank)
    {
      if (!prepare_fast_interp_stencil(BP, pox, ordn, Symmetry,
                                       cache.points[j].stencil))
      {
        // Drop the partially filled table; the identity fields are still
        // in the "unused" state, so this entry will not match later.
        cache.points.clear();
        return false;
      }
    }
  }

  // Build complete: publish the identity so the entry can be matched.
  cache.patch = patch;
  cache.NN = NN;
  cache.symmetry = Symmetry;
  return true;
 }
 } // namespace
 Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
 {
@@ -561,22 +1002,26 @@ void Patch::Interp_Points(MyList<var> *VarList,
    if (block_i >= 0)
    {
      Block *BP = block_index.views[block_i].bp;
-      owner_rank[j] = BP->rank;
+      owner_rank[j] = BP->rank;
-      if (myrank == BP->rank)
+      if (myrank == BP->rank)
-      {
+      {
-        //---> interpolation
+        //---> interpolation
-        varl = VarList;
+        if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
-        int k = 0;
+                                       Symmetry, Shellf + j * num_var))
-        while (varl) // run along variables
+        {
-        {
+          varl = VarList;
-          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+          int k = 0;
-                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+          while (varl) // run along variables
-          varl = varl->next;
+          {
-          k++;
+            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-        }
+                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
-      }
+            varl = varl->next;
-    }
+            k++;
-  }
+          }
        }
      }
    }
  }
  // Replace MPI_Allreduce with per-owner MPI_Bcast:
  // Group consecutive points by owner rank and broadcast each group.
@@ -659,10 +1104,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
    varl = varl->next;
  }
-  memset(Shellf, 0, sizeof(double) * NN * num_var);
+  // owner_rank[j] records which MPI rank owns point j
-
+  int *owner_rank;
  // owner_rank[j] records which MPI rank owns point j
  int *owner_rank;
  owner_rank = new int[NN];
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;
@@ -670,12 +1113,117 @@ void Patch::Interp_Points(MyList<var> *VarList,
  double DH[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
-  BlockBinIndex block_index;
+  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);
+  build_block_bin_index(this, DH, block_index);
-
+  SurfaceInterpCache *surface_cache = 0;
-  // --- Interpolation phase (identical to original) ---
+  bool use_surface_cache = false;
-  for (int j = 0; j < NN; j++)
+  if (interp_fast_enabled())
-  {
+  {
    surface_cache = find_surface_cache(this, NN, XX, Symmetry);
    use_surface_cache = surface_cache_key_matches(*surface_cache, this, NN, XX, Symmetry);
    if (!use_surface_cache)
      use_surface_cache = build_surface_cache(*surface_cache, this, NN, XX,
                                              Symmetry, DH, block_index, ordn);
  }
  // --- Interpolation phase (identical to original) ---
 #if USE_CUDA_BSSN
  const bool use_gpu_interp = interp_gpu_enabled() && use_surface_cache && num_var == 2 &&
                              VarList && VarList->next && !VarList->next->next;
 #else
  const bool use_gpu_interp = false;
 #endif
  if (use_gpu_interp)
  {
 #if USE_CUDA_BSSN
    vector<vector<int> > local_points(block_index.views.size());
    for (int j = 0; j < NN; j++)
    {
      for (int i = 0; i < dim; i++)
      {
        if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
        {
          cout << "Patch::Interp_Points: point (";
          for (int k = 0; k < dim; k++)
          {
            cout << XX[k][j];
            if (k < dim - 1)
              cout << ",";
            else
              cout << ") is out of current Patch." << endl;
          }
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      CachedInterpPoint &cp = surface_cache->points[j];
      Block *BP = cp.bp;
      owner_rank[j] = cp.owner_rank;
      if (BP && myrank == BP->rank)
      {
        for (size_t bi = 0; bi < block_index.views.size(); bi++)
        {
          if (block_index.views[bi].bp == BP)
          {
            local_points[bi].push_back(j);
            break;
          }
        }
      }
    }
    var *v0 = VarList->data;
    var *v1 = VarList->next->data;
    double soa6[6] = {
        v0->SoA[0], v0->SoA[1], v0->SoA[2],
        v1->SoA[0], v1->SoA[1], v1->SoA[2]};
    for (size_t bi = 0; bi < local_points.size(); bi++)
    {
      const int count = int(local_points[bi].size());
      if (count <= 0)
        continue;
      Block *BP = block_index.views[bi].bp;
      vector<double> px(count), py(count), pz(count), out(2 * count);
      for (int q = 0; q < count; q++)
      {
        const int j = local_points[bi][q];
        px[q] = XX[0][j];
        py[q] = XX[1][j];
        pz[q] = XX[2][j];
      }
      const double dx = BP->X[0][1] - BP->X[0][0];
      const double dy = BP->X[1][1] - BP->X[1][0];
      const double dz = BP->X[2][1] - BP->X[2][0];
      const int ok = bssn_cuda_interp_host_two_fields(
          BP, BP->shape,
          BP->fgfs[v0->sgfn], BP->fgfs[v1->sgfn],
          BP->X[0][0], BP->X[1][0], BP->X[2][0],
          dx, dy, dz,
          &px[0], &py[0], &pz[0], count,
          ordn, Symmetry, soa6, &out[0]);
      if (ok != 0)
      {
        if (myrank == 0)
          cout << "Patch::Interp_Points: CUDA two-field interpolation failed" << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      for (int q = 0; q < count; q++)
      {
        const int j = local_points[bi][q];
        Shellf[j * num_var] = out[2 * q];
        Shellf[j * num_var + 1] = out[2 * q + 1];
      }
    }
 #endif
  }
  else
  {
  for (int j = 0; j < NN; j++)
  {
    double pox[dim];
    for (int i = 0; i < dim; i++)
    {
@@ -692,28 +1240,59 @@ void Patch::Interp_Points(MyList<var> *VarList,
            cout << ") is out of current Patch." << endl;
        }
        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
+      }
-    }
+    }
-
+
-    const int block_i = find_block_index_for_point(block_index, pox, DH);
+    if (use_surface_cache)
-    if (block_i >= 0)
+    {
-    {
+      CachedInterpPoint &cp = surface_cache->points[j];
-      Block *BP = block_index.views[block_i].bp;
+      Block *BP = cp.bp;
-      owner_rank[j] = BP->rank;
+      owner_rank[j] = cp.owner_rank;
-      if (myrank == BP->rank)
+      if (BP && myrank == BP->rank)
-      {
+      {
-        varl = VarList;
+        if (!interpolate_var_list_with_stencil(BP, VarList, num_var, pox,
-        int k = 0;
+                                               ordn, Symmetry, cp.stencil,
-        while (varl)
+                                               Shellf + j * num_var))
-        {
+        {
-          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+          MyList<var> *varl_fallback = VarList;
-                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+          int k = 0;
-          varl = varl->next;
+          while (varl_fallback)
-          k++;
+          {
-        }
+            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl_fallback->data->sgfn], Shellf[j * num_var + k],
-      }
+                            pox[0], pox[1], pox[2], ordn, varl_fallback->data->SoA, Symmetry);
-    }
+            varl_fallback = varl_fallback->next;
-  }
+            k++;
          }
        }
      }
    }
    else
    {
      const int block_i = find_block_index_for_point(block_index, pox, DH);
      if (block_i >= 0)
      {
        Block *BP = block_index.views[block_i].bp;
        owner_rank[j] = BP->rank;
      if (myrank == BP->rank)
      {
        if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
                                       Symmetry, Shellf + j * num_var))
        {
          MyList<var> *varl_fallback = VarList;
          int k = 0;
          while (varl_fallback)
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl_fallback->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl_fallback->data->SoA, Symmetry);
            varl_fallback = varl_fallback->next;
            k++;
          }
        }
      }
      }
    }
  }
  }
 #ifdef INTERP_LB_PROFILE
  double t_interp_end = MPI_Wtime();
@@ -965,22 +1544,26 @@ void Patch::Interp_Points(MyList<var> *VarList,
    if (block_i >= 0)
    {
      Block *BP = block_index.views[block_i].bp;
-      owner_rank[j] = BP->rank;
+      owner_rank[j] = BP->rank;
-      if (myrank == BP->rank)
+      if (myrank == BP->rank)
-      {
+      {
-        //---> interpolation
+        //---> interpolation
-        varl = VarList;
+        if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
-        int k = 0;
+                                       Symmetry, Shellf + j * num_var))
-        while (varl) // run along variables
+        {
-        {
+          varl = VarList;
-          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+          int k = 0;
-                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+          while (varl) // run along variables
-          varl = varl->next;
+          {
-          k++;
+            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-        }
+                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
-      }
+            varl = varl->next;
-    }
+            k++;
-  }
+          }
        }
      }
    }
  }
  // Collect unique global owner ranks and translate to local ranks in Comm_here
  // Then broadcast each owner's points via MPI_Bcast on Comm_here
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -104,6 +104,14 @@ namespace Parallel
    double **recv_bufs;
    int *send_buf_caps;
    int *recv_buf_caps;
    unsigned char *send_buf_pinned;
    unsigned char *recv_buf_pinned;
    unsigned char *send_buf_is_dev;
    unsigned char *recv_buf_is_dev;
    int *send_buf_caps_dev;
    int *recv_buf_caps_dev;
    double **send_bufs_dev;
    double **recv_bufs_dev;
    MPI_Request *reqs;
    MPI_Status *stats;
    int max_reqs;
@@ -111,12 +119,14 @@ namespace Parallel
    int *tc_req_node;
    int *tc_req_is_recv;
    int *tc_completed;
    bool cuda_aware_mode;
    SyncCache();
    void invalidate();
    void destroy();
  };
  void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
  void Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache);
  void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);
@@ -179,13 +189,13 @@ namespace Parallel
  MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only);
  MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue
  MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat);
-  void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
+  void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
-                         MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
+                         MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
-  void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
+  void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
-  double L2Norm(Patch *Pat, var *vf);
+  double L2Norm(Patch *Pat, var *vf);
-  void L2Norm7(Patch *Pat, var **vf, double *norms);
+  void L2Norm7(Patch *Pat, var **vf, double *norms);
-  void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
+  void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
-  void checkvarl(MyList<var> *pp, bool first_only);
+  void checkvarl(MyList<var> *pp, bool first_only);
  MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
  MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat);
  void prepare_inter_time_level(Patch *Pat,
@@ -217,12 +227,12 @@ namespace Parallel
  void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape);
  bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl);
  void checkpatchlist(MyList<Patch> *PatL, bool buflog);
-
+
-  double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
+  double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
-  void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here);
+  void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here);
-  bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
+  bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
-                             int NN, double **XX,
+                             int NN, double **XX,
-                             double *Shellf, int Symmetry, MPI_Comm Comm_here);
+                             double *Shellf, int Symmetry, MPI_Comm Comm_here);
 #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
  MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
                            bool periodic, int start_rank, int end_rank, int nodes = 0);
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
@@ -27,7 +27,7 @@ using namespace std;
 #endif
 #include "TwoPunctures.h"
-#include <cblas.h>
+#include <mkl_cblas.h>
 TwoPunctures::TwoPunctures(double mp, double mm, double b,
                           double P_plusx, double P_plusy, double P_plusz,
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -1,9 +1,10 @@
-#ifdef newc
+#ifdef newc
-#include <sstream>
+#include <sstream>
-#include <cstdio>
+#include <cstdio>
-#include <map>
+#include <cstdlib>
-using namespace std;
+#include <map>
 using namespace std;
 #else
 #include <stdio.h>
 #include <map.h>
@@ -26,12 +27,20 @@ using namespace std;
 #include "shellfunctions.h"
 #include "cpbc.h"
 #include "kodiss.h"
-#include "parameters.h"
+#include "parameters.h"
-
+
-#ifdef With_AHF
+#ifndef USE_CUDA_Z4C
-#include "derivatives.h"
+#define USE_CUDA_Z4C 0
-#include "myglobal.h"
+#endif
-#endif
+
 #if USE_CUDA_Z4C && (ABEtype == 2)
 #include "z4c_rhs_cuda.h"
 #endif
 #ifdef With_AHF
 #include "derivatives.h"
 #include "myglobal.h"
 #endif
 //================================================================================================
@@ -105,20 +114,22 @@ void Z4c_class::Initialize()
  else
    GH->compose_cgh(nprocs);
-#ifdef WithShell
+#ifdef WithShell
-  SH = new ShellPatch(0, ngfs, pname, Symmetry, myrank, ErrorMonitor);
+  SH = new ShellPatch(0, ngfs, pname, Symmetry, myrank, ErrorMonitor);
-  if (!checkrun)
+  if (!checkrun)
-    SH->matchcheck(GH->PatL[0]);
+    SH->matchcheck(GH->PatL[0]);
  SH->compose_sh(nprocs);
  SH->setupcordtrans();
  SH->Dump_xyz(0, 0, 1);
  SH->setupintintstuff(nprocs, GH->PatL[0], Symmetry);
-  if (checkrun)
+  if (checkrun)
-    CheckPoint->readcheck_sh(SH, myrank);
+    CheckPoint->readcheck_sh(SH, myrank);
-#endif
+#endif
-
+
-  double h = GH->PatL[0]->data->blb->data->getdX(0);
+  Initialize_Level_Runtime();
  double h = GH->PatL[0]->data->blb->data->getdX(0);
  for (int i = 1; i < dim; i++)
    h = Mymin(h, GH->PatL[0]->data->blb->data->getdX(i));
  dT = Courant * h;
@@ -167,12 +178,753 @@ Z4c_class::~Z4c_class()
 #define MRBD 0 // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them
-#ifndef CPBC
+#ifndef CPBC
-// for sommerfeld boundary
+// for sommerfeld boundary
-
+
-void Z4c_class::Step(int lev, int YN)
+#if USE_CUDA_Z4C && (ABEtype == 2)
-{
+#ifdef WithShell
-  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
+#error "USE_CUDA_Z4C resident path currently supports Cartesian non-shell Z4C only"
 #endif
 #if (MRBD == 2)
 #error "USE_CUDA_Z4C resident path does not support MRBD == 2"
 #endif
 namespace {
 static const int k_z4c_cuda_bh_state_indices[3] = {18, 19, 20};
 // Gathers, for each variable in `vars`, the block's host field pointer
 // (cg->fgfs[sgfn]) into host_views[i]. When `propspeeds` is non-null it also
 // records each variable's propspeed; when `soa_flat` is non-null it records the
 // three SoA entries of variable i at soa_flat[3*i .. 3*i+2].
 // Returns true only when the list holds exactly Z4C_CUDA_STATE_COUNT entries;
 // at most Z4C_CUDA_STATE_COUNT slots of each output array are written.
 bool fill_z4c_cuda_views(Block *cg, MyList<var> *vars,
                         double **host_views,
                         double *propspeeds = 0,
                         double *soa_flat = 0)
 {
  int filled = 0;
  MyList<var> *node = vars;
  for (; node && filled < Z4C_CUDA_STATE_COUNT; node = node->next, ++filled)
  {
    host_views[filled] = cg->fgfs[node->data->sgfn];
    if (propspeeds)
      propspeeds[filled] = node->data->propspeed;
    if (soa_flat)
    {
      double *soa_slot = soa_flat + 3 * filled;
      for (int c = 0; c < 3; ++c)
        soa_slot[c] = node->data->SoA[c];
    }
  }
  // Success requires the list to be exhausted exactly when the table is full.
  return node == 0 && filled == Z4C_CUDA_STATE_COUNT;
 }
 // Decides whether level `lev` should keep its resident CUDA state after a step.
 // Both switches are read from the environment once and cached in function statics:
 //   AMSS_CUDA_KEEP_ALL_LEVELS                  -> keep every (non-analysis) level
 //   AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP     -> master enable; if it is unset,
 //   AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP is consulted instead.
 // Returns false when the feature is disabled or lev == analysis_lev; otherwise
 // true for all levels when keep-all is set, else only for lev < trfls_in.
 bool z4c_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
 {
  static int keep_all_levels = -1;
  if (keep_all_levels < 0)
  {
    const char *all_levels_env = getenv("AMSS_CUDA_KEEP_ALL_LEVELS");
    keep_all_levels = (all_levels_env && atoi(all_levels_env) != 0) ? 1 : 0;
  }
  static int enabled = -1;
  if (enabled < 0)
  {
    // The Z4C-specific variable wins whenever it is present, even if it is "0".
    const char *switch_env = getenv("AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP");
    if (!switch_env)
      switch_env = getenv("AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP");
    enabled = (switch_env && atoi(switch_env) != 0) ? 1 : 0;
  }
  if (!enabled || lev == analysis_lev)
    return false;
  return keep_all_levels ? true : (lev < trfls_in);
 }
 // For every block of the patch list that is owned by `myrank` and reports a
 // resident CUDA state, downloads that state into the host fields named by `vars`
 // and, if `release_ctx` is set, releases the block's step context afterwards.
 // A variable-list mismatch or a failed download aborts the whole MPI job.
 void z4c_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
 {
  for (MyList<Patch> *patch_node = PatL; patch_node; patch_node = patch_node->next)
  {
    Patch *patch = patch_node->data;
    for (MyList<Block> *block_node = patch->blb; block_node; block_node = block_node->next)
    {
      Block *cg = block_node->data;
      if (myrank == cg->rank && z4c_cuda_has_resident_state(cg))
      {
        double *host_fields[Z4C_CUDA_STATE_COUNT];
        const bool views_ok = fill_z4c_cuda_views(cg, vars, host_fields);
        if (!views_ok)
        {
          cout << "CUDA Z4C state list mismatch on resident state download" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (z4c_cuda_download_resident_state(cg, cg->shape, host_fields))
        {
          cout << "CUDA Z4C resident state download failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (release_ctx)
          z4c_cuda_release_step_ctx(cg);
      }
      // The patch's block range ends at ble; do not walk into the next patch's blocks.
      if (block_node == patch->ble)
        break;
    }
  }
 }
 // Returns true when `point` lies, in every dimension, inside the patch bounding
 // box shrunk by lli[d] cells on the lower side and uui[d] cells on the upper
 // side (bounds inclusive). A null patch never contains a point.
 bool z4c_cuda_patch_contains_point(Patch *patch, const double *point)
 {
  if (patch == 0)
    return false;
  for (int axis = 0; axis < dim; ++axis)
  {
    const double spacing = patch->getdX(axis);
    const double lower = patch->bbox[axis] + patch->lli[axis] * spacing;
    const double upper = patch->bbox[dim + axis] - patch->uui[axis] * spacing;
    const double coord = point[axis];
    const bool outside = (coord < lower) || (coord > upper);
    if (outside)
      return false;
  }
  return true;
 }
 // Tests whether `point` falls inside the interior region of `block`.
 // DH[d] is the grid spacing per dimension (supplied by the caller).
 // For each dimension the usable interval [llb, uub] is built as follows:
 //   - if the block face coincides with the patch face (feq within DH/2), the
 //     bound is inset by the patch's lli/uui cell count;
 //   - otherwise it is inset by ghost_width cells (Cell build) or by
 //     ghost_width - 0.5 cells (Vertex build).
 // The point is accepted with a half-cell tolerance on both sides.
 // Returns false for a null patch or block.
 bool z4c_cuda_point_in_block(Patch *patch, Block *block,
                             const double *point, const double *DH)
 {
  if (!patch || !block)
    return false;
  for (int d = 0; d < dim; d++)
  {
    double llb;
    double uub;
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
    // Vertex-centered grid: interior block faces are inset by ghost_width - 0.5 cells.
    llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
              ? block->bbox[d] + patch->lli[d] * DH[d]
              : block->bbox[d] + (ghost_width - 0.5) * DH[d];
    uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
              ? block->bbox[dim + d] - patch->uui[d] * DH[d]
              : block->bbox[dim + d] - (ghost_width - 0.5) * DH[d];
 #else
 #ifdef Cell
    // Cell-centered grid: interior block faces are inset by a full ghost_width cells.
    llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
              ? block->bbox[d] + patch->lli[d] * DH[d]
              : block->bbox[d] + ghost_width * DH[d];
    uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
              ? block->bbox[dim + d] - patch->uui[d] * DH[d]
              : block->bbox[dim + d] - ghost_width * DH[d];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
    // Reject when the point is more than half a cell outside [llb, uub].
    if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2)
      return false;
  }
  return true;
 }
 // Computes the first grid index of an `ordn`-point interpolation window along
 // one axis. coords[0] is the first coordinate, `n` the number of points and
 // `dx` the spacing. The window is centred around the index derived from `x`
 // and clamped to [0, n - ordn]. Returns 0 when `coords` is null or n <= ordn.
 int z4c_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn)
 {
  if (coords == 0 || n <= ordn)
    return 0;
  const int nearest = int((x - coords[0]) / dx + 0.4) + 1;
  const int first = nearest - ordn / 2;
  if (first < 0)
    return 0;
  const int last_valid = n - ordn;
  return (first > last_valid) ? last_valid : first;
 }
 // Interpolates the three fields forx/fory/forz at `point` on the given patch
 // list and broadcasts the result to all ranks of MPI_COMM_WORLD.
 //   PatL     : patches of one refinement level to search
 //   myrank   : calling rank; only the rank owning the containing block interpolates
 //   shellf   : output, 3 doubles; zeroed on entry, valid on every rank on success
 // Returns false (without any MPI call) when no block of PatL contains the point,
 // true after the broadcast otherwise.
 // NOTE(review): the trailing MPI_Bcast is collective, so every rank must reach
 // the same owner_rank decision -- this presumably relies on all ranks holding
 // identical patch/block metadata; confirm.
 bool z4c_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
                                       int myrank,
                                       const double *point,
                                       var *forx, var *fory, var *forz,
                                       int Symmetry,
                                       double *shellf)
 {
  // Interpolation stencil width: twice the ghost zone width.
  const int ordn = 2 * ghost_width;
  int owner_rank = -1;
  shellf[0] = shellf[1] = shellf[2] = 0.0;
  MyList<Patch> *PL = PatL;
  while (PL)
  {
    Patch *patch = PL->data;
    // Cheap rejection: skip patches whose (inset) bounding box excludes the point.
    if (!z4c_cuda_patch_contains_point(patch, point))
    {
      PL = PL->next;
      continue;
    }
    double DH[dim];
    for (int d = 0; d < dim; d++)
      DH[d] = patch->getdX(d);
    MyList<Block> *BP = patch->blb;
    while (BP)
    {
      Block *block = BP->data;
      if (z4c_cuda_point_in_block(patch, block, point, DH))
      {
        // First matching block wins; its rank becomes the broadcast root.
        owner_rank = block->rank;
        if (myrank == owner_rank)
        {
          int interp_ordn = ordn;
          int interp_sym = Symmetry;
          double x = point[0];
          double y = point[1];
          double z = point[2];
          // Resident-state path is only taken when the block is at least one
          // stencil wide in every dimension.
          if (z4c_cuda_has_resident_state(block) &&
              block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
          {
            var *vars[3] = {forx, fory, forz};
            // Opt-in device interpolation, read once from the environment and cached.
            static int use_device_bh_interp = -1;
            if (use_device_bh_interp < 0)
            {
              const char *env = getenv("AMSS_CUDA_Z4C_BH_INTERP_DEVICE");
              use_device_bh_interp = (env && atoi(env) != 0) ? 1 : 0;
            }
            bool used_device_interp = false;
            if (use_device_bh_interp)
            {
              // Flatten the three SoA triples into one 9-entry array for the device call.
              double soa3[9];
              for (int f = 0; f < 3; f++)
              {
                soa3[3 * f + 0] = vars[f]->SoA[0];
                soa3[3 * f + 1] = vars[f]->SoA[1];
                soa3[3 * f + 2] = vars[f]->SoA[2];
              }
              // A zero return is treated as success; anything else falls through
              // to the host path below.
              used_device_interp =
                  (z4c_cuda_interp_state_point3(block, block->shape,
                                                k_z4c_cuda_bh_state_indices[0],
                                                k_z4c_cuda_bh_state_indices[1],
                                                k_z4c_cuda_bh_state_indices[2],
                                                block->X[0][0], block->X[1][0], block->X[2][0],
                                                DH[0], DH[1], DH[2],
                                                x, y, z,
                                                interp_ordn, interp_sym,
                                                soa3, shellf) == 0);
            }
            if (!used_device_interp)
            {
              // Host fallback: pull only the three needed state fields off the
              // device into the block's host arrays, then interpolate on the host.
              double *shift_views[3] = {
                  block->fgfs[forx->sgfn],
                  block->fgfs[fory->sgfn],
                  block->fgfs[forz->sgfn]};
              if (z4c_cuda_download_state_subset(block, block->shape, 3,
                                                 k_z4c_cuda_bh_state_indices,
                                                 shift_views) != 0)
              {
                cout << "CUDA Z4C BH shift download failed" << endl;
                MPI_Abort(MPI_COMM_WORLD, 1);
              }
              f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
                              block->fgfs[forx->sgfn], shellf[0],
                              x, y, z, interp_ordn, forx->SoA, interp_sym);
              f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
                              block->fgfs[fory->sgfn], shellf[1],
                              x, y, z, interp_ordn, fory->SoA, interp_sym);
              f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
                              block->fgfs[forz->sgfn], shellf[2],
                              x, y, z, interp_ordn, forz->SoA, interp_sym);
            }
          }
          else
          {
            // No resident device state (or block too small): interpolate directly
            // from the host arrays without any download.
            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
                            block->fgfs[forx->sgfn], shellf[0],
                            x, y, z, interp_ordn, forx->SoA, interp_sym);
            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
                            block->fgfs[fory->sgfn], shellf[1],
                            x, y, z, interp_ordn, fory->SoA, interp_sym);
            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
                            block->fgfs[forz->sgfn], shellf[2],
                            x, y, z, interp_ordn, forz->SoA, interp_sym);
          }
        }
        break;
      }
      // ble marks the last block belonging to this patch.
      if (BP == patch->ble)
        break;
      BP = BP->next;
    }
    if (owner_rank >= 0)
      break;
    PL = PL->next;
  }
  if (owner_rank < 0)
    return false;
  // Share the owner's interpolated values with every rank.
  MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD);
  return true;
 }
 // For each of the BH_num points BH_PS[n], interpolates forx/fory/forz at that
 // point and stores the negated values in BH_RHS[n][0..2]. The search starts on
 // level `ilev` and steps down one level at a time until a level contains the
 // point. Returns false as soon as a point is found on no level from ilev to 0.
 bool z4c_cuda_compute_porg_rhs_resident(cgh *GH,
                                        int ilev,
                                        int myrank,
                                        int BH_num,
                                        double **BH_PS,
                                        double **BH_RHS,
                                        var *forx, var *fory, var *forz,
                                        int Symmetry)
 {
  for (int bh = 0; bh < BH_num; ++bh)
  {
    double interp[3] = {0.0, 0.0, 0.0};
    bool located = false;
    // Stops at the first level that succeeds, so no coarser level is queried.
    for (int lev = ilev; lev >= 0 && !located; --lev)
      located = z4c_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[bh],
                                                  forx, fory, forz, Symmetry, interp);
    if (!located)
      return false;
    for (int c = 0; c < 3; ++c)
      BH_RHS[bh][c] = -interp[c];
  }
  return true;
 }
 // For every block owned by `myrank` that has a resident CUDA state, downloads
 // the three state fields selected by k_z4c_cuda_bh_state_indices into the host
 // arrays of forx/fory/forz. Returns false on the first failed download, true
 // when all eligible blocks were processed.
 bool z4c_cuda_download_bh_shift_level(MyList<Patch> *PatL,
                                      int myrank,
                                      var *forx, var *fory, var *forz)
 {
  for (MyList<Patch> *patch_node = PatL; patch_node; patch_node = patch_node->next)
  {
    Patch *patch = patch_node->data;
    for (MyList<Block> *block_node = patch->blb; block_node; block_node = block_node->next)
    {
      Block *cg = block_node->data;
      if (myrank == cg->rank && z4c_cuda_has_resident_state(cg))
      {
        double *host_dst[3];
        host_dst[0] = cg->fgfs[forx->sgfn];
        host_dst[1] = cg->fgfs[fory->sgfn];
        host_dst[2] = cg->fgfs[forz->sgfn];
        if (z4c_cuda_download_state_subset(cg, cg->shape, 3,
                                           k_z4c_cuda_bh_state_indices,
                                           host_dst) != 0)
          return false;
      }
      // Do not run past the last block of this patch.
      if (block_node == patch->ble)
        break;
    }
  }
  return true;
 }
 // Recomputes the seven constraint fields on the device for every resident block
 // owned by `myrank` and downloads state field #24 into the host array of TZ0.
 // A device failure aborts the MPI job. Returns true only if every block owned
 // by this rank had a resident state; blocks without one are left untouched.
 // The `lev` parameter is accepted but not used here.
 bool z4c_cuda_refresh_constraint_level(MyList<Patch> *PatL,
                                       int myrank,
                                       var *Cons_Ham, var *Cons_Px,
                                       var *Cons_Py, var *Cons_Pz,
                                       var *Cons_Gx, var *Cons_Gy,
                                       var *Cons_Gz, var *TZ0,
                                       int Symmetry, int lev, double eps)
 {
  const int tz_index = 24;
  bool all_resident = true;
  for (MyList<Patch> *patch_node = PatL; patch_node; patch_node = patch_node->next)
  {
    Patch *patch = patch_node->data;
    for (MyList<Block> *block_node = patch->blb; block_node; block_node = block_node->next)
    {
      Block *cg = block_node->data;
      if (myrank == cg->rank)
      {
        if (z4c_cuda_has_resident_state(cg))
        {
          var *cons_vars[7] = {Cons_Ham, Cons_Px, Cons_Py, Cons_Pz,
                               Cons_Gx, Cons_Gy, Cons_Gz};
          double *constraints[7];
          for (int c = 0; c < 7; ++c)
            constraints[c] = cg->fgfs[cons_vars[c]->sgfn];
          double *tz_out[1] = {cg->fgfs[TZ0->sgfn]};
          int co = 0;
          // The download is attempted only if the constraint computation succeeded.
          const bool failed =
              z4c_cuda_compute_constraints_resident(cg, cg->shape,
                                                    cg->X[0], cg->X[1], cg->X[2],
                                                    Symmetry, eps, co,
                                                    constraints) ||
              z4c_cuda_download_state_subset(cg, cg->shape, 1, &tz_index, tz_out);
          if (failed)
          {
            cout << "CUDA Z4C resident constraint refresh failed" << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
        }
        else
        {
          all_resident = false;
        }
      }
      // Do not run past the last block of this patch.
      if (block_node == patch->ble)
        break;
    }
  }
  return all_resident;
 }
 // Returns a mutable reference to a function-local counter that starts at zero
 // and lives for the rest of the program.
 long long &z4c_constraint_output_counter()
 {
  static long long calls_so_far = 0;
  return calls_so_far;
 }
 // Returns the constraint-output stride taken from AMSS_CUDA_Z4C_CONSTRAINT_EVERY.
 // The variable is read on the first call only and the result is cached; an
 // unset, non-numeric or non-positive value yields 1.
 int z4c_constraint_output_every()
 {
  static int every = -1;
  if (every < 0)
  {
    const char *env = getenv("AMSS_CUDA_Z4C_CONSTRAINT_EVERY");
    const int parsed = env ? atoi(env) : 0;
    every = (parsed > 0) ? parsed : 1;
  }
  return every;
 }
 // Returns true when the current counter value is a multiple of the configured
 // stride; a stride of 1 (or less) means every call is due.
 bool z4c_constraint_output_due_now()
 {
  const int stride = z4c_constraint_output_every();
  if (stride <= 1)
    return true;
  return (z4c_constraint_output_counter() % stride) == 0;
 }
 // Increments the shared constraint-output counter by one.
 void z4c_constraint_output_advance()
 {
  long long &counter = z4c_constraint_output_counter();
  counter += 1;
 }
 } // namespace
 #endif
 void Z4c_class::Step(int lev, int YN)
 {
 #if USE_CUDA_Z4C && (ABEtype == 2)
  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
 #ifdef With_AHF
  AH_Step_Find(lev, dT_lev);
 #endif
  bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
  double ndeps = numepss;
  if (lev < GH->movls)
    ndeps = numepsb;
  double TRK4 = PhysTime;
  int iter_count = 0;
  int pre = 0, cor = 1;
  int ERROR = 0;
  const double dT_mon = dT * pow(0.5, Mymax(0, trfls));
  const bool need_constraint_after_step =
      (LastConsOut + dT_mon >= AnasTime) && z4c_constraint_output_due_now();
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    if (!z4c_cuda_download_bh_shift_level(GH->PatL[lev], myrank, Sfx0, Sfy0, Sfz0))
    {
      if (myrank == 0 && ErrorMonitor->outfile)
        ErrorMonitor->outfile << "CUDA Z4C failed to download predictor black-hole shift at t = "
                              << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
      if (Symmetry > 0)
        Porg[ithBH][2] = fabs(Porg[ithBH][2]);
      if (Symmetry == 2)
      {
        Porg[ithBH][0] = fabs(Porg[ithBH][0]);
        Porg[ithBH][1] = fabs(Porg[ithBH][1]);
      }
    }
  }
  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank)
      {
        double *state_in[Z4C_CUDA_STATE_COUNT];
        double *state_out[Z4C_CUDA_STATE_COUNT];
        double propspeed[Z4C_CUDA_STATE_COUNT];
        double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
        if (!fill_z4c_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
            !fill_z4c_cuda_views(cg, SynchList_pre, state_out))
        {
          cout << "CUDA Z4C state list mismatch on predictor step" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        int apply_bam_bc = 0;
 #if (MRBD == 0)
 #if (SommerType == 0)
        apply_bam_bc = (lev == 0) ? 1 : 0;
 #endif
 #elif (MRBD == 1)
        apply_bam_bc = 1;
 #endif
        int keep_resident_state = 1;
        int apply_enforce_ga = 0;
 #if (AGM == 0)
        apply_enforce_ga = 1;
 #endif
        if (z4c_cuda_rk4_substep(cg,
                                 cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                 state_in, state_out,
                                 propspeed, soa_flat, Pp->data->bbox,
                                 dT_lev, TRK4, iter_count, apply_bam_bc,
                                 Symmetry, lev, ndeps, pre,
                                 keep_resident_state, apply_enforce_ga, chitiny))
        {
          cout << "CUDA Z4C predictor substep failed in domain: ("
               << cg->bbox[0] << ":" << cg->bbox[3] << ","
               << cg->bbox[1] << ":" << cg->bbox[4] << ","
               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
          ERROR = 1;
        }
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    if (myrank == 0 && ErrorMonitor->outfile)
      ErrorMonitor->outfile << "CUDA Z4C failed in predictor at t = " << PhysTime
                            << ", lev = " << lev << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  {
    Parallel::AsyncSyncState async_pre;
    Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
    Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
  }
  if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime))
    z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
  if (lev == a_lev)
    AnalysisStuff(lev, dT_lev);
  for (iter_count = 1; iter_count < 4; iter_count++)
  {
    if (iter_count == 1 || iter_count == 3)
      TRK4 += dT_lev / 2;
    Pp = GH->PatL[lev];
    while (Pp)
    {
      MyList<Block> *BP = Pp->data->blb;
      while (BP)
      {
        Block *cg = BP->data;
        if (myrank == cg->rank)
        {
          double *state_in[Z4C_CUDA_STATE_COUNT];
          double *state_out[Z4C_CUDA_STATE_COUNT];
          double propspeed[Z4C_CUDA_STATE_COUNT];
          double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
          if (!fill_z4c_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
              !fill_z4c_cuda_views(cg, SynchList_cor, state_out))
          {
            cout << "CUDA Z4C state list mismatch on corrector step" << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          int apply_bam_bc = 0;
 #if (MRBD == 0)
 #if (SommerType == 0)
          apply_bam_bc = (lev == 0) ? 1 : 0;
 #endif
 #elif (MRBD == 1)
          apply_bam_bc = 1;
 #endif
          int keep_resident_state = 1;
          int apply_enforce_ga = 0;
 #if (AGM == 0)
          apply_enforce_ga = 1;
 #elif (AGM == 1)
          apply_enforce_ga = (iter_count == 3) ? 1 : 0;
 #endif
          if (z4c_cuda_rk4_substep(cg,
                                   cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                   state_in, state_out,
                                   propspeed, soa_flat, Pp->data->bbox,
                                   dT_lev, TRK4, iter_count, apply_bam_bc,
                                   Symmetry, lev, ndeps, cor,
                                   keep_resident_state, apply_enforce_ga, chitiny))
          {
            cout << "CUDA Z4C corrector substep failed in domain: ("
                 << cg->bbox[0] << ":" << cg->bbox[3] << ","
                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
          if (!ERROR && iter_count == 3 && need_constraint_after_step)
          {
            double *constraints[7] = {
                cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn],
                cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
                cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn],
                cg->fgfs[Cons_Gz->sgfn]};
            double *tz_out[1] = {cg->fgfs[TZ0->sgfn]};
            const int tz_index = 24;
            if (z4c_cuda_download_constraint_outputs(cg->shape, constraints) ||
                z4c_cuda_download_state_subset(cg, cg->shape, 1, &tz_index, tz_out))
            {
              cout << "CUDA Z4C constraint download failed in domain: ("
                   << cg->bbox[0] << ":" << cg->bbox[3] << ","
                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
              ERROR = 1;
            }
          }
        }
        if (BP == Pp->data->ble)
          break;
        BP = BP->next;
      }
      Pp = Pp->next;
    }
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      if (myrank == 0 && ErrorMonitor->outfile)
        ErrorMonitor->outfile << "CUDA Z4C failed in RK4 substep#" << iter_count
                              << " at t = " << PhysTime
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    {
      Parallel::AsyncSyncState async_cor;
      Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
      Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
    }
    if (BH_num > 0 && lev == GH->levels - 1)
    {
      if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num,
                                              Porg, Porg1,
                                              Sfx, Sfy, Sfz, Symmetry))
      {
        if (myrank == 0 && ErrorMonitor->outfile)
          ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = "
                                << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      for (int ithBH = 0; ithBH < BH_num; ithBH++)
      {
        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
        if (Symmetry > 0)
          Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
        if (Symmetry == 2)
        {
          Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
          Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
        }
      }
    }
    if (iter_count < 3)
    {
      Pp = GH->PatL[lev];
      while (Pp)
      {
        MyList<Block> *BP = Pp->data->blb;
        while (BP)
        {
          Block *cg = BP->data;
          cg->swapList(SynchList_pre, SynchList_cor, myrank);
          if (BP == Pp->data->ble)
            break;
          BP = BP->next;
        }
        Pp = Pp->next;
      }
      if (BH_num > 0 && lev == GH->levels - 1)
      {
        for (int ithBH = 0; ithBH < BH_num; ithBH++)
        {
          Porg[ithBH][0] = Porg1[ithBH][0];
          Porg[ithBH][1] = Porg1[ithBH][1];
          Porg[ithBH][2] = Porg1[ithBH][2];
        }
      }
    }
  }
  {
    const bool keep_resident = z4c_cuda_keep_resident_after_step(lev, trfls, a_lev);
    const bool need_host_after_step =
        ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime));
    if (!keep_resident || need_host_after_step)
      z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, !keep_resident);
  }
 #if (RPS == 0)
  RestrictProlong(lev, YN, BB);
 #endif
  Pp = GH->PatL[lev];
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      cg->swapList(StateList, SynchList_cor, myrank);
      cg->swapList(OldStateList, SynchList_cor, myrank);
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
      Porg0[ithBH][0] = Porg1[ithBH][0];
      Porg0[ithBH][1] = Porg1[ithBH][1];
      Porg0[ithBH][2] = Porg1[ithBH][2];
    }
  }
 #else
  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
 #ifdef With_AHF
  AH_Step_Find(lev, dT_lev);
 #endif
@@ -1039,15 +1791,19 @@ void Z4c_class::Step(int lev, int YN)
    {
      Porg0[ithBH][0] = Porg1[ithBH][0];
      Porg0[ithBH][1] = Porg1[ithBH][1];
-      Porg0[ithBH][2] = Porg1[ithBH][2];
+      Porg0[ithBH][2] = Porg1[ithBH][2];
-    }
+    }
-  }
+  }
-}
+#endif
-#else
+}
-// for constraint preserving boundary (CPBC)
+#else
-#ifndef WithShell
+// for constraint preserving boundary (CPBC)
-#error "CPBC only supports Shell"
+#if USE_CUDA_Z4C && (ABEtype == 2)
-#endif
+#error "USE_CUDA_Z4C resident path does not support CPBC"
 #endif
 #ifndef WithShell
 #error "CPBC only supports Shell"
 #endif
 // 0: extrapolate rhs, 1: extrapolate variable
 // 2: extrapolate variable but before RHS calculation
@@ -2408,17 +3164,23 @@ void Z4c_class::Check_extrop()
 //================================================================================================
-void Z4c_class::Constraint_Out()
+void Z4c_class::Constraint_Out()
-{
+{
-  // here we have to use the same variable name as in the parent class
+  // here we have to use the same variable name as in the parent class
-  LastConsOut += dT * pow(0.5, Mymax(0, trfls));
+  LastConsOut += dT * pow(0.5, Mymax(0, trfls));
-  
+  
-  if (LastConsOut >= AnasTime)
+  if (LastConsOut >= AnasTime)
-  // Constraint violation
+  // Constraint violation
-  {
+  {
-    // recompute least the constraint data lost for moved new grid
+#if USE_CUDA_Z4C && (ABEtype == 2)
-    for (int lev = 0; lev < GH->levels; lev++)
+    bool cuda_constraints_ready = true;
-    {
+#else
    const bool cuda_constraints_ready = false;
 #endif
    // recompute least the constraint data lost for moved new grid
    if (!cuda_constraints_ready)
      for (int lev = 0; lev < GH->levels; lev++)
    {
      // make sure the data consistent for higher levels
      if (lev > 0)
      {
--- a/AMSS_NCKU_source/Z4c_rhs.f90
+++ b/AMSS_NCKU_source/Z4c_rhs.f90
@@ -94,29 +94,31 @@
               Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon,                    &            
               Symmetry,Lev,eps,co)
-#if (ABV == 0)  
+  if (co == 0) then
-  call ricci_gamma(ex, X, Y, Z,                                      &
+#if (ABV == 0)  
-               chi,                                                  &
+    call ricci_gamma(ex, X, Y, Z,                                      &
-               dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
+                 chi,                                                  &
-               Gamx   ,  Gamy    ,  Gamz    , &
+                 dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
-               Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
+                 Gamx   ,  Gamy    ,  Gamz    , &
-               Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
+                 Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
-               Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
+                 Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
-               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
+                 Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
-               Symmetry)
+                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
-#endif
+                 Symmetry)
-  call constraint_bssn(ex, X, Y, Z,&
+#endif
-               chi,trK, &
+    call constraint_bssn(ex, X, Y, Z,&
-               dxx,gxy,gxz,dyy,gyz,dzz, &
+                 chi,trK, &
-               Axx,Axy,Axz,Ayy,Ayz,Azz, &
+                 dxx,gxy,gxz,dyy,gyz,dzz, &
-               Gamx,Gamy,Gamz,&
+                 Axx,Axy,Axz,Ayy,Ayz,Azz, &
-               Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
+                 Gamx,Gamy,Gamz,&
-               Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
+                 Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
-               Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
+                 Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
-               Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
+                 Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
-               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
+                 Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
-               Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
+                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
-               Symmetry)
+                 Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
                 Symmetry)
  endif
  return
@@ -226,11 +228,12 @@
  call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta)
-!!! sanity check
+!!! sanity check
-  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
+#ifdef DEBUG
-      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
+  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
-      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
+      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
-      +sum(Lap)+sum(betax)+sum(betay)+sum(betaz)+sum(dtSfx)+sum(dtSfy)+sum(dtSfz) &
+      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
      +sum(Lap)+sum(betax)+sum(betay)+sum(betaz)+sum(dtSfx)+sum(dtSfy)+sum(dtSfz) &
      +sum(TZ)
  if(dX.ne.dX) then
     if(sum(chi).ne.sum(chi))write(*,*)"Z4c_rhs.f90: find NaN in chi"
@@ -257,10 +260,11 @@
     if(sum(dtSfx).ne.sum(dtSfx))write(*,*)"Z4c_rhs.f90: find NaN in dtSfx"
     if(sum(dtSfy).ne.sum(dtSfy))write(*,*)"Z4c_rhs.f90: find NaN in dtSfy"
     if(sum(dtSfz).ne.sum(dtSfz))write(*,*)"Z4c_rhs.f90: find NaN in dtSfz"
-     if(sum(TZ).ne.sum(Tz))write(*,*)"Z4c_rhs.f90: find NaN in TZ"
+     if(sum(TZ).ne.sum(Tz))write(*,*)"Z4c_rhs.f90: find NaN in TZ"
-     gont = 1
+     gont = 1
-     return
+     return
-  endif
+  endif
 #endif
  PI = dacos(-ONE)
@@ -1263,30 +1267,32 @@
  endif
-#if (ABV == 0)  
+  if (co == 0) then
-  call ricci_gamma(ex, X, Y, Z,                                      &
+#if (ABV == 0)  
-               chi,                                                  &
+    call ricci_gamma(ex, X, Y, Z,                                      &
-               dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
+                 chi,                                                  &
-               Gamx   ,  Gamy    ,  Gamz    , &
+                 dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
-               Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
+                 Gamx   ,  Gamy    ,  Gamz    , &
-               Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
+                 Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
-               Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
+                 Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
-               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
+                 Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
-               Symmetry)
+                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
-#endif
+                 Symmetry)
-
+#endif
-  call constraint_bssn(ex, X, Y, Z,&
+
-               chi,trK, &
+    call constraint_bssn(ex, X, Y, Z,&
-               dxx,gxy,gxz,dyy,gyz,dzz, &
+                 chi,trK, &
-               Axx,Axy,Axz,Ayy,Ayz,Azz, &
+                 dxx,gxy,gxz,dyy,gyz,dzz, &
-               Gamx,Gamy,Gamz,&
+                 Axx,Axy,Axz,Ayy,Ayz,Azz, &
-               Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
+                 Gamx,Gamy,Gamz,&
-               Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
+                 Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
-               Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
+                 Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
-               Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
+                 Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
-               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
+                 Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
-               Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
+                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
-               Symmetry)
+                 Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
                 Symmetry)
  endif
  gont = 0
--- a/AMSS_NCKU_source/Z4c_rhs_ss.f90
+++ b/AMSS_NCKU_source/Z4c_rhs_ss.f90
@@ -121,11 +121,12 @@
  call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta)
-!!! sanity check
+!!! sanity check
-  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
+#ifdef DEBUG
-      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
+  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
-      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
+      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
-      +sum(Lap)+sum(betax)+sum(betay)+sum(betaz)+sum(dtSfx)+sum(dtSfy)+sum(dtSfz) &
+      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
      +sum(Lap)+sum(betax)+sum(betay)+sum(betaz)+sum(dtSfx)+sum(dtSfy)+sum(dtSfz) &
      +sum(TZ)
  if(dX.ne.dX) then
     if(sum(chi).ne.sum(chi))write(*,*)"Z4c_rhs_ss.f90: find NaN in chi"
@@ -152,10 +153,11 @@
     if(sum(dtSfx).ne.sum(dtSfx))write(*,*)"Z4c_rhs_ss.f90: find NaN in dtSfx"
     if(sum(dtSfy).ne.sum(dtSfy))write(*,*)"Z4c_rhs_ss.f90: find NaN in dtSfy"
     if(sum(dtSfz).ne.sum(dtSfz))write(*,*)"Z4c_rhs_ss.f90: find NaN in dtSfz"
-     if(sum(TZ).ne.sum(Tz))write(*,*)"Z4c_rhs_ss.f90: find NaN in TZ"
+     if(sum(TZ).ne.sum(Tz))write(*,*)"Z4c_rhs_ss.f90: find NaN in TZ"
-     gont = 1
+     gont = 1
-     return
+     return
-  endif
+  endif
 #endif
  PI = dacos(-ONE)
@@ -1388,41 +1390,43 @@
  call kodis_sh(ex,crho,sigma,R,TZ,TZ_rhs,SSS,Symmetry,eps,sst)
  endif
-#if (ABV == 1)  
+  if (co == 0) then
-  call ricci_gamma_ss(ex,crho,sigma,R,X, Y, Z,                                 &
+#if (ABV == 1)  
-               drhodx, drhody, drhodz,                                         &
+    call ricci_gamma_ss(ex,crho,sigma,R,X, Y, Z,                               &
-               dsigmadx,dsigmady,dsigmadz,                                     &
+                 drhodx, drhody, drhodz,                                       &
-               dRdx,dRdy,dRdz,                                                 &
+                 dsigmadx,dsigmady,dsigmadz,                                   &
-               drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz,                &
+                 dRdx,dRdy,dRdz,                                               &
-               dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz,    &
+                 drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz,              &
-               dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz,                            &
+                 dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz,  &
-               chi,                                                  &
+                 dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz,                          &
-               dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
+                 chi,                                                          &
-               Gamx   ,  Gamy    ,  Gamz    , &
+                 dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
-               Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
+                 Gamx   ,  Gamy    ,  Gamz    , &
-               Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
+                 Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
-               Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
+                 Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
-               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
+                 Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
-               Symmetry,Lev,sst)
+                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
-  call constraint_bssn_ss(ex,crho,sigma,R,X, Y, Z,  &
+                 Symmetry,Lev,sst)
-               drhodx, drhody, drhodz,                                         &
+#endif
-               dsigmadx,dsigmady,dsigmadz,                                     &
+    call constraint_bssn_ss(ex,crho,sigma,R,X, Y, Z,  &
-               dRdx,dRdy,dRdz,                                                 &
+                 drhodx, drhody, drhodz,                                       &
-               drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz,                &
+                 dsigmadx,dsigmady,dsigmadz,                                   &
-               dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz,    &
+                 dRdx,dRdy,dRdz,                                               &
-               dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz,                            &
+                 drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz,              &
-               chi,trK, &
+                 dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz,  &
-               dxx,gxy,gxz,dyy,gyz,dzz, &
+                 dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz,                          &
-               Axx,Axy,Axz,Ayy,Ayz,Azz, &
+                 chi,trK, &
-               Gamx,Gamy,Gamz,&
+                 dxx,gxy,gxz,dyy,gyz,dzz, &
-               Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
+                 Axx,Axy,Axz,Ayy,Ayz,Azz, &
-               Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
+                 Gamx,Gamy,Gamz,&
-               Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
+                 Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
-               Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
+                 Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
-               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
+                 Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
-               Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
+                 Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
-               Symmetry,Lev,sst)
+                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
-#endif
+                 Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
                 Symmetry,Lev,sst)
  endif
  gont = 0
--- a/AMSS_NCKU_source/bssnEM_class.C
+++ b/AMSS_NCKU_source/bssnEM_class.C
@@ -15,10 +15,13 @@ using namespace std;
 #include "misc.h"
 #include "Ansorg.h"
 #include "fmisc.h"
-#include "Parallel.h"
+#include "Parallel.h"
-#include "bssnEM_class.h"
+#include "bssnEM_class.h"
-#include "bssn_rhs.h"
+#include "bssn_rhs.h"
-#include "empart.h"
+#if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
 #endif
 #include "empart.h"
 #include "initial_puncture.h"
 #include "initial_maxwell.h"
 #include "enforce_algebra.h"
@@ -32,11 +35,111 @@ using namespace std;
 #ifdef With_AHF
 #include "derivatives.h"
 #include "myglobal.h"
-#endif
+#endif
-
+
-//================================================================================================
+//================================================================================================
-
+
-// Define bssnEM_class
+#if USE_CUDA_BSSN
 namespace {
 // Gather the leading BSSN_CUDA_STATE_COUNT variables of `vars` for block `cg`.
 //
 // For the i-th visited variable, host_views[i] is set to cg->fgfs[sgfn].
 // When the optional outputs are non-null, propspeeds[i] receives the
 // variable's propspeed and soa_flat[3*i .. 3*i+2] receives SoA[0..2].
 //
 // Returns true only if the list provided at least BSSN_CUDA_STATE_COUNT
 // variables. On a shorter list the outputs are only partially written and
 // the function returns false.
 bool fill_bssn_cuda_views_prefix(Block *cg, MyList<var> *vars,
                                 double **host_views,
                                 double *propspeeds = nullptr,
                                 double *soa_flat = nullptr)
 {
  int filled = 0;
  for (MyList<var> *node = vars;
       node && filled < BSSN_CUDA_STATE_COUNT;
       node = node->next, ++filled)
  {
    var *entry = node->data;
    host_views[filled] = cg->fgfs[entry->sgfn];
    if (propspeeds)
      propspeeds[filled] = entry->propspeed;
    if (soa_flat)
    {
      // Three SoA entries per variable, stored contiguously.
      for (int comp = 0; comp < 3; ++comp)
        soa_flat[3 * filled + comp] = entry->SoA[comp];
    }
  }
  return filled == BSSN_CUDA_STATE_COUNT;
 }
 // Advance three list cursors in lockstep by up to BSSN_CUDA_STATE_COUNT nodes.
 // Stops early as soon as any of the three cursors becomes null; the cursors
 // are modified in place through the reference parameters.
 void skip_bssn_cuda_prefix(MyList<var> *&a, MyList<var> *&b, MyList<var> *&c)
 {
  int remaining = BSSN_CUDA_STATE_COUNT;
  while (remaining > 0 && a && b && c)
  {
    a = a->next;
    b = b->next;
    c = c->next;
    --remaining;
  }
 }
 // Four-list overload: advance all four cursors in lockstep by up to
 // BSSN_CUDA_STATE_COUNT nodes, stopping early once any cursor is null.
 // The cursors are modified in place through the reference parameters.
 void skip_bssn_cuda_prefix(MyList<var> *&a, MyList<var> *&b,
                           MyList<var> *&c, MyList<var> *&d)
 {
  int remaining = BSSN_CUDA_STATE_COUNT;
  while (remaining > 0 && a && b && c && d)
  {
    a = a->next;
    b = b->next;
    c = c->next;
    d = d->next;
    --remaining;
  }
 }
 // Run one RK4 substep for the BSSN prefix of the state on block `cg` by
 // forwarding to bssn_cuda_rk4_substep.
 //
 // state_in_list / state_out_list supply the input and output variables; only
 // their first BSSN_CUDA_STATE_COUNT entries are used. The ten matter
 // variables (rho, S_i, S_ij) are passed as views into cg->fgfs.
 //
 // Returns 1 without calling the CUDA routine if either list has fewer than
 // BSSN_CUDA_STATE_COUNT entries; otherwise returns the value produced by
 // bssn_cuda_rk4_substep.
 int run_bssn_em_cuda_substep(Block *cg,
                             MyList<var> *state_in_list,
                             MyList<var> *state_out_list,
                             Patch *patch,
                             double &dT_lev,
                             double &TRK4,
                             int &iter_count,
                             int &Symmetry,
                             int lev,
                             double &ndeps,
                             int &co,
                             double &chitiny,
                             var *rho, var *Sx, var *Sy, var *Sz,
                             var *Sxx, var *Sxy, var *Sxz,
                             var *Syy, var *Syz, var *Szz)
 {
  double *in_views[BSSN_CUDA_STATE_COUNT];
  double *out_views[BSSN_CUDA_STATE_COUNT];
  // Matter source views, in the order rho, Sx..Sz, Sxx..Szz.
  double *matter_views[BSSN_CUDA_MATTER_COUNT] = {
      cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
      cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
      cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn]};
  double speeds[BSSN_CUDA_STATE_COUNT];
  double soa[3 * BSSN_CUDA_STATE_COUNT];

  // Propagation speeds and SoA entries are taken from the input list only.
  if (!fill_bssn_cuda_views_prefix(cg, state_in_list, in_views, speeds, soa))
    return 1;
  if (!fill_bssn_cuda_views_prefix(cg, state_out_list, out_views))
    return 1;

  // The boundary flag is raised only on level 0, and only when compiled with
  // SommerType == 0 and without WithShell.
  int bam_bc_flag = 0;
 #if (SommerType == 0) && !defined(WithShell)
  if (lev == 0)
    bam_bc_flag = 1;
 #endif

  // These options are always passed as 0 (disabled) from this wrapper.
  int zero_matter_flag = 0;
  int resident_flag = 0;
  int enforce_ga_flag = 0;

  return bssn_cuda_rk4_substep(cg,
                               cg->shape, cg->X[0], cg->X[1], cg->X[2],
                               in_views, out_views, matter_views,
                               speeds, soa, patch->bbox,
                               dT_lev, TRK4, iter_count, bam_bc_flag,
                               Symmetry, lev, ndeps, co,
                               zero_matter_flag,
                               resident_flag, enforce_ga_flag, chitiny);
 }
 }
 #endif
 //================================================================================================
 // Define bssnEM_class
 // It inherits some members and methods from the parent class bssn_class and modifies others.
 // The modified members and methods are defined below (and in the header bssnEM_class.h).
@@ -232,19 +335,21 @@ void bssnEM_class::Initialize()
  else
    GH->compose_cgh(nprocs);
-#ifdef WithShell
+#ifdef WithShell
-  SH = new ShellPatch(0, ngfs, pname, Symmetry, myrank, ErrorMonitor);
+  SH = new ShellPatch(0, ngfs, pname, Symmetry, myrank, ErrorMonitor);
-  SH->matchcheck(GH->PatL[0]);
+  SH->matchcheck(GH->PatL[0]);
  SH->compose_sh(nprocs);
  SH->setupcordtrans();
  SH->Dump_xyz(0, 0, 1);
  SH->setupintintstuff(nprocs, GH->PatL[0], Symmetry);
-  if (checkrun)
+  if (checkrun)
-    CheckPoint->readcheck_sh(SH, myrank);
+    CheckPoint->readcheck_sh(SH, myrank);
-#endif
+#endif
-
+
-  double h = GH->PatL[0]->data->blb->data->getdX(0);
+  Initialize_Level_Runtime();
  double h = GH->PatL[0]->data->blb->data->getdX(0);
  for (int i = 1; i < dim; i++)
    h = Mymin(h, GH->PatL[0]->data->blb->data->getdX(i));
  dT = Courant * h;
@@ -258,8 +363,6 @@ void bssnEM_class::Initialize()
    PhysTime = StartTime;
    Setup_Black_Hole_position();
  }
  setup_transfer_caches();
 }
 //================================================================================================
@@ -853,10 +956,11 @@ void bssnEM_class::Step(int lev, int YN)
                     cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                     cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
-#endif
+#endif
-
+
-        if (
+        bool used_gpu_substep = false;
-            f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+        if (
            f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi0->sgfn],
                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -873,11 +977,20 @@ void bssnEM_class::Step(int lev, int YN)
                                 cg->fgfs[Kpsi_rhs->sgfn], cg->fgfs[Kphi_rhs->sgfn],
                                 cg->fgfs[rho->sgfn], 
                                 cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
-                                 cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
+                                 cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
-                                 cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
+                                 cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
-                                 Symmetry, lev, ndeps) ||
+                                 Symmetry, lev, ndeps) ||
-            f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+#if USE_CUDA_BSSN
-                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
+            ((used_gpu_substep =
                  (run_bssn_em_cuda_substep(cg, StateList, SynchList_pre, Pp->data,
                                            dT_lev, TRK4, iter_count, Symmetry, lev,
                                            ndeps, pre, chitiny,
                                            rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz) == 0))
                 ? 0
                 : 1) ||
 #endif
            (!used_gpu_substep && f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                               cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                               cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                               cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
@@ -906,10 +1019,10 @@ void bssnEM_class::Step(int lev, int YN)
                               cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
                               cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], 
                               cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
-                               cg->fgfs[Cons_Ham->sgfn],
+                               cg->fgfs[Cons_Ham->sgfn],
-                               cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+                               cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
-                               cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
+                               cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                               Symmetry, lev, ndeps, pre))
+                               Symmetry, lev, ndeps, pre)))
        {
          cout << "find NaN in domain: (" 
               << cg->bbox[0] << ":" << cg->bbox[3] << "," 
@@ -919,11 +1032,15 @@ void bssnEM_class::Step(int lev, int YN)
        }
        // rk4 substep and boundary
-        {
+        {
-          MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; 
+          MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList;
-          // we do not check the correspondence here
+          // we do not check the correspondence here
-          
+#if USE_CUDA_BSSN
-          while (varl0)
+          if (used_gpu_substep)
            skip_bssn_cuda_prefix(varl0, varl, varlrhs);
 #endif
          while (varl0)
          {
 #ifndef WithShell
            if (lev == 0) // sommerfeld indeed
@@ -1223,7 +1340,7 @@ void bssnEM_class::Step(int lev, int YN)
  }
 #endif
-  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
+  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
 #ifdef WithShell
  if (lev == 0)
@@ -1309,10 +1426,11 @@ void bssnEM_class::Step(int lev, int YN)
                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
-#endif
+#endif
-
+
-          if (
+          bool used_gpu_substep = false;
-              f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+          if (
              f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                   cg->fgfs[phi->sgfn],
                                   cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                   cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -1329,11 +1447,20 @@ void bssnEM_class::Step(int lev, int YN)
                                   cg->fgfs[Kpsi1->sgfn], cg->fgfs[Kphi1->sgfn],
                                   cg->fgfs[rho->sgfn], 
                                   cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
-                                   cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
+                                   cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
-                                   cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
+                                   cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
-                                   Symmetry, lev, ndeps) ||
+                                   Symmetry, lev, ndeps) ||
-              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+#if USE_CUDA_BSSN
-                                 cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
+              ((used_gpu_substep =
                    (run_bssn_em_cuda_substep(cg, SynchList_pre, SynchList_cor, Pp->data,
                                              dT_lev, TRK4, iter_count, Symmetry, lev,
                                              ndeps, cor, chitiny,
                                              rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz) == 0))
                   ? 0
                   : 1) ||
 #endif
              (!used_gpu_substep && f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                 cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                 cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                                 cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
@@ -1361,10 +1488,10 @@ void bssnEM_class::Step(int lev, int YN)
                                 cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
                                 cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], 
                                 cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
-                                 cg->fgfs[Cons_Ham->sgfn],
+                                 cg->fgfs[Cons_Ham->sgfn],
-                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
-                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
+                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                                 Symmetry, lev, ndeps, cor))
+                                 Symmetry, lev, ndeps, cor)))
          {
            cout << "find NaN in domain: (" 
                 << cg->bbox[0] << ":" << cg->bbox[3] << "," 
@@ -1373,11 +1500,15 @@ void bssnEM_class::Step(int lev, int YN)
            ERROR = 1;
          }
          // rk4 substep and boundary
-          {
+          {
-            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; 
+            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList;
-            // we do not check the correspondence here
+            // we do not check the correspondence here
-            
+#if USE_CUDA_BSSN
-            while (varl0)
+            if (used_gpu_substep)
              skip_bssn_cuda_prefix(varl0, varl, varl1, varlrhs);
 #endif
            while (varl0)
            {
 #ifndef WithShell
              if (lev == 0) // sommerfeld indeed
@@ -1685,7 +1816,7 @@ void bssnEM_class::Step(int lev, int YN)
    }
 #endif
-    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
+    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
 #ifdef WithShell
    if (lev == 0)
--- a/AMSS_NCKU_source/bssnEScalar_class.C
+++ b/AMSS_NCKU_source/bssnEScalar_class.C
--- a/AMSS_NCKU_source/bssnEScalar_class.h
+++ b/AMSS_NCKU_source/bssnEScalar_class.h
@@ -51,20 +51,24 @@ public:
     void Compute_Psi4(int lev);
     void Step(int lev, int YN);
     void AnalysisStuff_EScalar(int lev, double dT_lev);
-     void Interp_Constraint(bool infg);
+     void Interp_Constraint();
     void Constraint_Out(); 
-protected:
+protected:
-     var *Sphio, *Spio;
+     var *Sphio, *Spio;
-     var *Sphi0, *Spi0;
+     var *Sphi0, *Spi0;
     var *Sphi,  *Spi;
     var *Sphi1, *Spi1;
     var *Sphi_rhs, *Spi_rhs;
-
+
-     var *Cons_fR;
+     var *Cons_fR;
-
+
-     monitor *MaxScalar_Monitor;
+     MyList<var> *BSSNStateList, *BSSNSynchList_pre, *BSSNSynchList_cor;
-};
+     MyList<var> *ScalarSynchList_pre, *ScalarSynchList_cor;
     Parallel::SyncCache *sync_cache_scalar_pre, *sync_cache_scalar_cor;
     monitor *MaxScalar_Monitor;
 };
 #endif /* BSSNESCALAR_CLASS_H */
--- a/AMSS_NCKU_source/bssnEScalar_rhs.f90
+++ b/AMSS_NCKU_source/bssnEScalar_rhs.f90
@@ -3,11 +3,143 @@
 !! note that the potential for scalar field in F(R) gravity
 !!       is defined in the file Set_Rho_ADM.f90
-#include "macrodef.fh"
+#include "macrodef.fh"
-
+
-! rhs for scalar and GR variables
+! scalar RHS and stress-energy only; BSSN RHS can be supplied by CUDA.
-! here we consider vacuum spacetime only
+  function compute_rhs_bssn_escalar_matter(ex, T, X, Y, Z,                    &
-  function compute_rhs_bssn_escalar(ex, T,X, Y, Z,                             &
+               chi    ,   trK    ,                                            &
               dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,    &
               Axx    ,   Axy    ,   Axz    ,   Ayy    ,   Ayz    ,   Azz,    &
               Gamx   ,  Gamy    ,  Gamz    ,                                 &
               Lap    ,  betax   ,  betay   ,  betaz   ,                      &
               dtSfx  ,  dtSfy   ,  dtSfz   ,                                 &
               Sphi   ,   Spi    ,                                            &
               Sphi_rhs ,   Spi_rhs ,                                         &
               rho,Sx,Sy,Sz,Sxx,Sxy,Sxz,Syy,Syz,Szz,                          &
               Symmetry,Lev,eps) result(gont)
  ! Purpose: fill the scalar-field right-hand sides (Sphi_rhs, Spi_rhs) and
  !   the source arrays (rho, Sx..Sz, Sxx..Szz) from the current metric and
  !   scalar-field arrays. No BSSN right-hand side is computed here.
  ! Inputs referenced by the body: ex, X, Y, Z, chi, trK, dxx, gxy, gxz, dyy,
  !   gyz, dzz, Gamx, Gamy, Gamz, Lap, Sphi, Spi, Symmetry, Lev.
  ! Arguments accepted but never referenced in this routine: T, Axx..Azz,
  !   betax..betaz, dtSfx..dtSfz, eps.
  !   NOTE(review): presumably kept so the argument list parallels
  !   compute_rhs_bssn_escalar - confirm against the caller.
  ! Result gont: 1 when the NaN screen on the inputs trips (the routine then
  !   returns before writing any output array), 0 otherwise.
  implicit none
  integer,intent(in ):: ex(1:3), Symmetry,Lev
  real*8, intent(in ):: T
  real*8, intent(in ):: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: chi,dxx,dyy,dzz
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in ) :: trK
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in ) :: gxy,gxz,gyz
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in ) :: Axx,Axy,Axz,Ayy,Ayz,Azz
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in ) :: Gamx,Gamy,Gamz
  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Lap, betax, betay, betaz
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in ) :: dtSfx,  dtSfy,  dtSfz
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in ) :: Sphi,Spi
  real*8, dimension(ex(1),ex(2),ex(3)),intent(out) :: Sphi_rhs,Spi_rhs
  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: rho,Sx,Sy,Sz
  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Sxx,Sxy,Sxz,Syy,Syz,Szz
  real*8,intent(in) :: eps
  integer::gont
  ! Local work arrays, each with the full block extent ex(1) x ex(2) x ex(3).
  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz
  real*8, dimension(ex(1),ex(2),ex(3)) :: chix,chiy,chiz
  real*8, dimension(ex(1),ex(2),ex(3)) :: Lapx,Lapy,Lapz
  real*8, dimension(ex(1),ex(2),ex(3)) :: Kx,Ky,Kz,S
  real*8, dimension(ex(1),ex(2),ex(3)) :: f,fxx,fxy,fxz,fyy,fyz,fzz
  real*8, dimension(ex(1),ex(2),ex(3)) :: alpn1,chin1
  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz
  real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz
  real*8            :: dX
  real*8, parameter :: ZEO=0.d0, ONE = 1.D0, TWO = 2.D0, HALF = 0.5D0
  real*8, parameter :: SYM = 1.D0
  ! NaN screen: a NaN anywhere in the summed inputs makes dX NaN, and
  ! dX.ne.dX is true only for NaN. The per-array tests below just report
  ! which input is contaminated before bailing out with gont = 1.
  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
      +sum(Lap)+sum(Sphi)+sum(Spi)
  if(dX.ne.dX) then
     if(sum(chi).ne.sum(chi))write(*,*)"bssn_escalar_matter: find NaN in chi"
     if(sum(trK).ne.sum(trK))write(*,*)"bssn_escalar_matter: find NaN in trk"
     if(sum(dxx).ne.sum(dxx))write(*,*)"bssn_escalar_matter: find NaN in dxx"
     if(sum(gxy).ne.sum(gxy))write(*,*)"bssn_escalar_matter: find NaN in gxy"
     if(sum(gxz).ne.sum(gxz))write(*,*)"bssn_escalar_matter: find NaN in gxz"
     if(sum(dyy).ne.sum(dyy))write(*,*)"bssn_escalar_matter: find NaN in dyy"
     if(sum(gyz).ne.sum(gyz))write(*,*)"bssn_escalar_matter: find NaN in gyz"
     if(sum(dzz).ne.sum(dzz))write(*,*)"bssn_escalar_matter: find NaN in dzz"
     if(sum(Gamx).ne.sum(Gamx))write(*,*)"bssn_escalar_matter: find NaN in Gamx"
     if(sum(Gamy).ne.sum(Gamy))write(*,*)"bssn_escalar_matter: find NaN in Gamy"
     if(sum(Gamz).ne.sum(Gamz))write(*,*)"bssn_escalar_matter: find NaN in Gamz"
     if(sum(Lap).ne.sum(Lap))write(*,*)"bssn_escalar_matter: find NaN in Lap"
     if(sum(Sphi).ne.sum(Sphi))write(*,*)"bssn_escalar_matter: find NaN in Sphi"
     if(sum(Spi).ne.sum(Spi))write(*,*)"bssn_escalar_matter: find NaN in Spi"
     gont = 1
     return
  endif
  ! The stored fields are offsets from one: Lap, chi and the diagonal metric
  ! components dxx, dyy, dzz each get ONE added before use.
  alpn1 = Lap + ONE
  chin1 = chi + ONE
  gxx = dxx + ONE
  gyy = dyy + ONE
  gzz = dzz + ONE
  call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  ! gupzz is used as scratch for the 3x3 determinant of (gxx..gzz) and is
  ! overwritten with the real zz component of the inverse on the last line
  ! of this group, so the order of these statements matters.
  gupzz =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
           gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
  gupxx =   ( gyy * gzz - gyz * gyz ) / gupzz
  gupxy = - ( gxy * gzz - gyz * gxz ) / gupzz
  gupxz =   ( gxy * gyz - gyy * gxz ) / gupzz
  gupyy =   ( gxx * gzz - gxz * gxz ) / gupzz
  gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz
  gupzz =   ( gxx * gyy - gxy * gxy ) / gupzz
 #if 1
  Sphi_rhs = alpn1 * Spi
  ! Kx, Ky, Kz hold the first derivatives of Sphi; fxx..fzz hold its second
  ! derivatives.
  call fderivs(ex,Sphi,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  call fdderivs(ex,Sphi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  Spi_rhs =   gupxx * fxx + gupyy * fyy + gupzz * fzz +           &
             ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) * TWO -  &
             ((Gamx+(gupxx*chix+gupxy*chiy+gupxz*chiz)/TWO/chin1)*Kx  &
            + (Gamy+(gupxy*chix+gupyy*chiy+gupyz*chiz)/TWO/chin1)*Ky  &
            + (Gamz+(gupxz*chix+gupyz*chiy+gupzz*chiz)/TWO/chin1)*Kz)
  Spi_rhs = Spi_rhs*alpn1 +                                      &
            (gupxx*Lapx*Kx + gupxy*Lapx*Ky + gupxz*Lapx*Kz       &
            +gupxy*Lapy*Kx + gupyy*Lapy*Ky + gupyz*Lapy*Kz       &
            +gupxz*Lapz*Kx + gupyz*Lapz*Ky + gupzz*Lapz*Kz)
  ! frpotential fills f and S from Sphi. f is added to rho below and S is
  ! subtracted in the Spi_rhs source term.
  ! NOTE(review): presumably f is the potential and S its derivative with
  ! respect to Sphi - confirm in Set_Rho_ADM.f90.
  call frpotential(ex,Sphi,f,S)
  Spi_rhs = Spi_rhs*chin1 + alpn1*(trK*Spi - S)
  rho = chin1*((gupxx * Kx * Kx + gupyy * Ky * Ky + gupzz * Kz * Kz)/TWO + &
                gupxy * Kx * Ky + gupxz * Kx * Kz + gupyz * Ky * Kz )      &
        + Spi*Spi/TWO+f
  Sx = -Spi*Kx
  Sy = -Spi*Ky
  Sz = -Spi*Kz
  ! f is recycled here: it no longer holds the frpotential output but the
  ! common factor multiplying the metric in Sxx..Szz.
  f = (rho - Spi*Spi)/chin1
  Sxx = Kx*Kx-f*gxx
  Sxy = Kx*Ky-f*gxy
  Sxz = Kx*Kz-f*gxz
  Syy = Ky*Ky-f*gyy
  Syz = Ky*Kz-f*gyz
  Szz = Kz*Kz-f*gzz
 #else
  ! Disabled branch: zero scalar RHS and zero sources.
  Sphi_rhs = ZEO
  Spi_rhs = ZEO
  rho = ZEO
  Sx = ZEO
  Sy = ZEO
  Sz = ZEO
  Sxx = ZEO
  Sxy = ZEO
  Sxz = ZEO
  Syy = ZEO
  Syz = ZEO
  Szz = ZEO
 #endif
  ! Normal completion. Unlike the input screen above, the outputs are not
  ! checked for NaN here.
  gont = 0
  return
 end function compute_rhs_bssn_escalar_matter
 ! rhs for scalar and GR variables
 ! here we consider vacuum spacetime only
  function compute_rhs_bssn_escalar(ex, T,X, Y, Z,                             &
               chi    ,   trK    ,                                             &
               dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,     &
               Axx    ,   Axy    ,   Axz    ,   Ayy    ,   Ayz    ,   Azz,     &
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -31,19 +31,11 @@ using namespace std;
 #include "surface_integral.h"
 #include "checkpoint.h"
-extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN);
+extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN);
-
+
-#ifndef BSSN_USE_TRANSFER_CACHE
+class bssn_class
-#define BSSN_USE_TRANSFER_CACHE 1
+{
-#endif
+public:
 #ifndef BSSN_USE_ESCALAR_C_KERNEL
 #define BSSN_USE_ESCALAR_C_KERNEL 1
 #endif
 class bssn_class
 {
 public:
       int ngfs;
       int nprocs, myrank;
       cgh *GH;
@@ -56,6 +48,7 @@ public:
       double StartTime, TotalTime;
       double AnasTime, DumpTime, d2DumpTime, CheckTime;
       double LastAnas, LastConsOut;
       bool cuda_level0_constraint_cache_valid;
       int *ConstraintRefreshLevels;
       double Courant;
       double numepss, numepsb, numepsh;
@@ -175,35 +168,27 @@ public:
       void Setup_KerrSchild();
       void Enforce_algcon(int lev, int fg);
-       void testRestrict();
+       void testRestrict();
-       void testOutBd();
+       void testOutBd();
-       
+       
-       bool check_Stdin_Abort(); 
+       bool check_Stdin_Abort(); 
-       bool use_transfer_cache() const;
+
-       void setup_transfer_caches();
+       virtual void Setup_Initial_Data_Cao();
-       void invalidate_transfer_caches();
+       virtual void Setup_Initial_Data_Lousto();
-       void destroy_transfer_caches();
+       virtual void Initialize();
       void sync_predictor_start(int lev, MyList<var> *VarList, Parallel::AsyncSyncState &async_state);
       void sync_predictor_finish(int lev, Parallel::AsyncSyncState &async_state, MyList<var> *VarList);
       void sync_corrector_start(int lev, MyList<var> *VarList, Parallel::AsyncSyncState &async_state);
       void sync_corrector_finish(int lev, Parallel::AsyncSyncState &async_state, MyList<var> *VarList);
       void sync_evolution(int lev, MyList<var> *VarList, Parallel::SyncCache *cache_array = 0);
       void restrict_evolution(int lev, MyList<var> *src_var_list, MyList<var> *dst_var_list);
       void outbdlow2hi_evolution(int lev, MyList<var> *src_var_list, MyList<var> *dst_var_list);
       virtual void Setup_Initial_Data_Cao();
       virtual void Setup_Initial_Data_Lousto();
       virtual void Initialize();
       virtual void Read_Ansorg();
       virtual void Read_Pablo() {};
-       virtual void Compute_Psi4(int lev);
+       virtual void Compute_Psi4(int lev);
-       virtual void Step(int lev, int YN);
+       virtual void Step(int lev, int YN);
-       virtual void Interp_Constraint(bool infg);
+       virtual void Interp_Constraint(bool infg);
-       virtual void Constraint_Out();
+       virtual void Constraint_Out();
-       virtual void Compute_Constraint();
+       virtual void Compute_Constraint();
-
+
-#ifdef With_AHF
+protected:
-protected:
+       void Initialize_Level_Runtime();
 #ifdef With_AHF
 protected:
       MyList<var> *AHList, *AHDList, *GaugeList;
       int AHfindevery;
       double AHdumptime;
--- a/AMSS_NCKU_source/bssn_escalar_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_escalar_rhs_c.C
@@ -1,169 +0,0 @@
 #include "macrodef.h"
 #include "bssn_rhs.h"
 #include "share_func.h"
 #include "tool.h"
 #include <vector>
 namespace
 {
    // Reuse the temporary workspace across block calls to avoid repeated heap churn
    // in the EScalar wrapper. MPI ranks execute this path sequentially, so a single
    // process-local buffer is sufficient here.
    // The buffer only ever grows (see the resize in f_compute_rhs_bssn_escalar_c)
    // and is never shrunk in the code visible here.
    // NOTE(review): this is unsynchronized file-scope state; presumably the
    // wrapper is never entered from several threads of one process - confirm.
    std::vector<double> g_escalar_tmp_store;
 }
 // Map f_frpotential onto the external symbol spelling selected by the
 // fortran1 / fortran2 / fortran3 build macros (plain lowercase, upper case,
 // or lowercase with a trailing underscore).
 #ifdef fortran1
 #define f_frpotential frpotential
 #endif
 #ifdef fortran2
 #define f_frpotential FRPOTENTIAL
 #endif
 #ifdef fortran3
 #define f_frpotential frpotential_
 #endif
 extern "C"
 {
    // Called below as f_frpotential(ex, Sphi, V, dVdSphi): reads the grid
    // extents and the scalar field and fills the two output arrays.
    void f_frpotential(int *, double *, double *, double *);
 }
 // C++ wrapper for the BSSN + scalar-field right-hand side on one grid block.
 //
 // Order of operations:
 //   1. take derivatives of chi, Lap and Sphi and evaluate f_frpotential;
 //   2. pointwise, fill Sphi_rhs / Spi_rhs and the source arrays
 //      rho, Sx..Sz, Sxx..Szz;
 //   3. call f_compute_rhs_bssn with those source arrays to fill the BSSN
 //      right-hand sides and auxiliary outputs;
 //   4. call lopsided_kodis on (Sphi, Sphi_rhs) and (Spi, Spi_rhs);
 //   5. scan Sphi_rhs, Spi_rhs and rho for NaN.
 //
 // ex gives the block extents {nx, ny, nz}; every field pointer is indexed
 // linearly over nx*ny*nz entries.
 //
 // Returns 1 if f_compute_rhs_bssn returns non-zero or if a NaN is found in
 // step 5, and 0 otherwise. The inputs are not screened for NaN here.
 int f_compute_rhs_bssn_escalar_c(int *ex, double &T,
                                 double *X, double *Y, double *Z,
                                 double *chi, double *trK,
                                 double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                                 double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                                 double *Gamx, double *Gamy, double *Gamz,
                                 double *Lap, double *betax, double *betay, double *betaz,
                                 double *dtSfx, double *dtSfy, double *dtSfz,
                                 double *Sphi, double *Spi,
                                 double *chi_rhs, double *trK_rhs,
                                 double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                                 double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                                 double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                                 double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                                 double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                                 double *Sphi_rhs, double *Spi_rhs,
                                 double *rho, double *Sx, double *Sy, double *Sz,
                                 double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                                 double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                                 double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                                 double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                                 double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                                 double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
                                 double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
                                 int &Symmetry, int &Lev, double &eps, int &co)
 {
    const int nx = ex[0], ny = ex[1], nz = ex[2];
    const int all = nx * ny * nz;
    // 17 scratch arrays of `all` doubles are carved out below:
    // 3 (chi derivs) + 3 (Sphi derivs) + 6 (Sphi second derivs)
    // + 3 (Lap derivs) + 2 (frpotential outputs).
    const size_t workspace_size = size_t(all) * 17;
    if (g_escalar_tmp_store.size() < workspace_size)
        g_escalar_tmp_store.resize(workspace_size);
    double *tmp_ptr = g_escalar_tmp_store.data();
    // Bump allocator over the shared buffer: hands out the next n*all doubles.
    auto alloc_tmp = [&](int n = 1) -> double *
    {
        double *ptr = tmp_ptr;
        tmp_ptr += size_t(all) * n;
        return ptr;
    };
    double *chix = alloc_tmp(), *chiy = alloc_tmp(), *chiz = alloc_tmp();
    double *Kx = alloc_tmp(), *Ky = alloc_tmp(), *Kz = alloc_tmp();
    double *fxx = alloc_tmp(), *fxy = alloc_tmp(), *fxz = alloc_tmp();
    double *fyy = alloc_tmp(), *fyz = alloc_tmp(), *fzz = alloc_tmp();
    double *Lapx = alloc_tmp(), *Lapy = alloc_tmp(), *Lapz = alloc_tmp();
    double *V = alloc_tmp(), *dVdSphi = alloc_tmp();
    // ZEO is declared but not used in this function.
    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, HALF = 0.5;
    // Passed to lopsided_kodis for both scalar variables.
    const double SSS[3] = {1.0, 1.0, 1.0};
    // Kx, Ky, Kz are the first derivatives of Sphi; fxx..fzz its second derivatives.
    fderivs(ex, chi, chix, chiy, chiz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Sphi, Kx, Ky, Kz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, Sphi, fxx, fxy, fxz, fyy, fyz, fzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    f_frpotential(ex, Sphi, V, dVdSphi);
    for (int i = 0; i < all; ++i)
    {
        // Lap, chi and the diagonal metric entries are stored as offsets from one.
        const double alpn1 = Lap[i] + ONE;
        const double chin1 = chi[i] + ONE;
        const double gxx = dxx[i] + ONE;
        const double gyy = dyy[i] + ONE;
        const double gzz = dzz[i] + ONE;
        // Determinant and inverse of the symmetric 3x3 metric at this point.
        const double det = gxx * gyy * gzz + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i]
                         - gxz[i] * gyy * gxz[i] - gxy[i] * gxy[i] * gzz - gxx * gyz[i] * gyz[i];
        const double gupxx = (gyy * gzz - gyz[i] * gyz[i]) / det;
        const double gupxy = -(gxy[i] * gzz - gyz[i] * gxz[i]) / det;
        const double gupxz = (gxy[i] * gyz[i] - gyy * gxz[i]) / det;
        const double gupyy = (gxx * gzz - gxz[i] * gxz[i]) / det;
        const double gupyz = -(gxx * gyz[i] - gxy[i] * gxz[i]) / det;
        const double gupzz = (gxx * gyy - gxy[i] * gxy[i]) / det;
        Sphi_rhs[i] = alpn1 * Spi[i];
        // Spi_rhs is built in three stages: second-derivative and connection
        // terms, then lapse scaling plus lapse-gradient terms, then chin1
        // scaling plus the trK and potential-derivative source.
        Spi_rhs[i] = gupxx * fxx[i] + gupyy * fyy[i] + gupzz * fzz[i]
                   + TWO * (gupxy * fxy[i] + gupxz * fxz[i] + gupyz * fyz[i])
                   - ((Gamx[i] + (gupxx * chix[i] + gupxy * chiy[i] + gupxz * chiz[i]) / TWO / chin1) * Kx[i]
                   +  (Gamy[i] + (gupxy * chix[i] + gupyy * chiy[i] + gupyz * chiz[i]) / TWO / chin1) * Ky[i]
                   +  (Gamz[i] + (gupxz * chix[i] + gupyz * chiy[i] + gupzz * chiz[i]) / TWO / chin1) * Kz[i]);
        Spi_rhs[i] = Spi_rhs[i] * alpn1
                   + gupxx * Lapx[i] * Kx[i] + gupxy * Lapx[i] * Ky[i] + gupxz * Lapx[i] * Kz[i]
                   + gupxy * Lapy[i] * Kx[i] + gupyy * Lapy[i] * Ky[i] + gupyz * Lapy[i] * Kz[i]
                   + gupxz * Lapz[i] * Kx[i] + gupyz * Lapz[i] * Ky[i] + gupzz * Lapz[i] * Kz[i];
        Spi_rhs[i] = Spi_rhs[i] * chin1 + alpn1 * (trK[i] * Spi[i] - dVdSphi[i]);
        // Source arrays handed to f_compute_rhs_bssn below.
        rho[i] = chin1 * ((gupxx * Kx[i] * Kx[i] + gupyy * Ky[i] * Ky[i] + gupzz * Kz[i] * Kz[i]) * HALF
               + gupxy * Kx[i] * Ky[i] + gupxz * Kx[i] * Kz[i] + gupyz * Ky[i] * Kz[i])
               + Spi[i] * Spi[i] * HALF + V[i];
        Sx[i] = -Spi[i] * Kx[i];
        Sy[i] = -Spi[i] * Ky[i];
        Sz[i] = -Spi[i] * Kz[i];
        // Common factor multiplying the metric in Sxx..Szz.
        const double pressure = (rho[i] - Spi[i] * Spi[i]) / chin1;
        Sxx[i] = Kx[i] * Kx[i] - pressure * gxx;
        Sxy[i] = Kx[i] * Ky[i] - pressure * gxy[i];
        Sxz[i] = Kx[i] * Kz[i] - pressure * gxz[i];
        Syy[i] = Ky[i] * Ky[i] - pressure * gyy;
        Syz[i] = Ky[i] * Kz[i] - pressure * gyz[i];
        Szz[i] = Kz[i] * Kz[i] - pressure * gzz;
    }
    // BSSN right-hand side with the source arrays filled above; any non-zero
    // return is propagated as failure.
    if (f_compute_rhs_bssn(ex, T, X, Y, Z,
                           chi, trK,
                           dxx, gxy, gxz, dyy, gyz, dzz,
                           Axx, Axy, Axz, Ayy, Ayz, Azz,
                           Gamx, Gamy, Gamz,
                           Lap, betax, betay, betaz,
                           dtSfx, dtSfy, dtSfz,
                           chi_rhs, trK_rhs,
                           gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
                           Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
                           Gamx_rhs, Gamy_rhs, Gamz_rhs,
                           Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
                           dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
                           rho, Sx, Sy, Sz,
                           Sxx, Sxy, Sxz, Syy, Syz, Szz,
                           Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                           Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                           Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                           Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                           ham_Res, movx_Res, movy_Res, movz_Res,
                           Gmx_Res, Gmy_Res, Gmz_Res,
                           Symmetry, Lev, eps, co))
        return 1;
    // NOTE(review): presumably adds shift advection and Kreiss-Oliger
    // dissipation (strength eps) to the scalar RHS - confirm in lopsided_kodis.
    lopsided_kodis(ex, X, Y, Z, Sphi, Sphi_rhs, betax, betay, betaz, Symmetry, SSS, eps);
    lopsided_kodis(ex, X, Y, Z, Spi, Spi_rhs, betax, betay, betaz, Symmetry, SSS, eps);
    // x != x is true only for NaN; fail the block if any output is contaminated.
    for (int i = 0; i < all; ++i)
    {
        if (Sphi_rhs[i] != Sphi_rhs[i] || Spi_rhs[i] != Spi_rhs[i] || rho[i] != rho[i])
            return 1;
    }
    return 0;
 }
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -1,73 +0,0 @@
 #ifndef BSSN_GPU_H_
 #define BSSN_GPU_H_
 #include "bssn_macro.h"
 #include "macrodef.fh"
 #define DEVICE_ID 0
 // #define DEVICE_ID_BY_MPI_RANK
 #define GRID_DIM 256
 #define BLOCK_DIM 128
 #define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]]
 #define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]]
 #define pow2(x) ((x) * (x))
 #define TimeBetween(a, b) ((b.tv_sec - a.tv_sec) + (b.tv_usec - a.tv_usec) / 1000000.0f)
 #define M_ metac.
 #define Mh_ meta->
 #define Ms_ metassc.
 #define Msh_ metass->
 // #define TIMING
 #define RHS_SS_PARA int calledby, int mpi_rank, int *ex, double &T, double *crho, double *sigma, double *R, double *X, double *Y, double *Z, double *drhodx, double *drhody, double *drhodz, double *dsigmadx, double *dsigmady, double *dsigmadz, double *dRdx, double *dRdy, double *dRdz, double *drhodxx, double *drhodxy, double *drhodxz, double *drhodyy, double *drhodyz, double *drhodzz, double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, double *dsigmadyy, double *dsigmadyz, double *dsigmadzz, double *dRdxx, double *dRdxy, double *dRdxz, double *dRdyy, double *dRdyz, double *dRdzz, double *chi, double *trK, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, double *Gamx, double *Gamy, double *Gamz, double *Lap, double *betax, double *betay, double *betaz, double *dtSfx, double *dtSfy, double *dtSfz, double *chi_rhs, double *trK_rhs, double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, double *rho, double *Sx, double *Sy, double *Sz, double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, int &Symmetry, int &Lev, double &eps, int 
&sst, int &co
 /**  main function */
 // Declaration only; the definition is not part of this header.
 // Two leading integers (calledby, mpi_rank) come first, followed by the grid
 // extents/coordinates, the evolved fields, their *_rhs outputs, the source
 // arrays (rho, S*), the Gam*** and R** arrays, the *_Res arrays and the
 // control arguments Symmetry, Lev, eps, co.
 // NOTE(review): the meaning of the int return value and of calledby is not
 // visible here - confirm in bssn_gpu.cu.
 int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,
            double *X, double *Y, double *Z,
            double *chi, double *trK,
            double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
            double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
            double *Gamx, double *Gamy, double *Gamz,
            double *Lap, double *betax, double *betay, double *betaz,
            double *dtSfx, double *dtSfy, double *dtSfz,
            double *chi_rhs, double *trK_rhs,
            double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
            double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
            double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
            double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
            double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
            double *rho, double *Sx, double *Sy, double *Sz, double *Sxx,
            double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
            double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
            double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
            double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
            double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
            double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
            double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
            int &Symmetry, int &Lev, double &eps, int &co);
 // Variant whose full parameter list is supplied by the RHS_SS_PARA macro.
 int gpu_rhs_ss(RHS_SS_PARA);
 /** Init GPU side data in GPUMeta. */
 // void init_fluid_meta_gpu(GPUMeta *gpu_meta);
 #endif
--- a/AMSS_NCKU_source/bssn_gpu_class.C
+++ b/AMSS_NCKU_source/bssn_gpu_class.C
--- a/AMSS_NCKU_source/bssn_gpu_class.h
+++ b/AMSS_NCKU_source/bssn_gpu_class.h
@@ -1,210 +0,0 @@
 #ifndef BSSN_GPU_CLASS_H
 #define BSSN_GPU_CLASS_H
 #ifdef newc
 #include <iostream>
 #include <iomanip>
 #include <fstream>
 #include <cstdlib>
 #include <string>
 #include <cmath>
 using namespace std;
 #else
 #include <iostream.h>
 #include <iomanip.h>
 #include <fstream.h>
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
 #endif
 #include <mpi.h>
 #include "macrodef.h"
 #include "cgh.h"
 #include "ShellPatch.h"
 #include "misc.h"
 #include "var.h"
 #include "MyList.h"
 #include "monitor.h"
 #include "surface_integral.h"
 #include "checkpoint.h"
 // added by yangquan
 #include "bssn_macro.h"
 extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN);
 // Declaration of bssn_class as seen by the GPU build (bssn_gpu_class.h).
 // It holds run-control scalars, the grid handles (GH, SH), one `var *` per
 // grid function, the variable lists built from them, monitors, and the
 // evolution / analysis interface. Only declarations are visible here; all
 // behaviour lives in the corresponding implementation file.
 class bssn_class
 {
 public:
       // added by yangquan
       //----------------------
       // Per-node runtime description plus the GPU stepping entry point.
       // NOTE(review): presumably filled in by Get_runtime_envirment() - confirm.
       int gpu_num_mynode;
       int cpu_core_num_mynode;
       int mpi_process_num_mynode;
       int my_sequence_mynode;
       int mynode_id;
       int use_gpu;
       virtual void Step_GPU(int lev, int YN);
       virtual void Get_runtime_envirment();
       // virtual void Step_OPENMP(int lev,int YN);
       //----------------------
       int ngfs;
       int nprocs, myrank;
       cgh *GH;
       ShellPatch *SH;
       double PhysTime;
       int checkrun;
       char checkfilename[50];
       int Steps;
       double StartTime, TotalTime;
       double AnasTime, DumpTime, d2DumpTime, CheckTime;
       double LastAnas, LastConsOut;
       double Courant;
       double numepss, numepsb, numepsh;
       int Symmetry;
       int maxl, decn;
       double maxrex, drex;
       int trfls, a_lev;
       double dT;
       double chitiny;
       // Black-hole bookkeeping: position arrays and per-hole parameters.
       double **Porg0, **Porgbr, **Porg, **Porg1, **Porg_rhs;
       int BH_num, BH_num_input;
       double *Mass, *Pmom, *Spin;
       double ADMMass;
       // Grid-function handles. The same field set is declared five times with
       // the suffixes "o", "0", none, "1" and "_rhs".
       // NOTE(review): presumably these populate OldStateList, StateList,
       // SynchList_pre, SynchList_cor and RHSList respectively - confirm in
       // the constructor.
       var *phio, *trKo;
       var *gxxo, *gxyo, *gxzo, *gyyo, *gyzo, *gzzo;
       var *Axxo, *Axyo, *Axzo, *Ayyo, *Ayzo, *Azzo;
       var *Gmxo, *Gmyo, *Gmzo;
       var *Lapo, *Sfxo, *Sfyo, *Sfzo;
       var *dtSfxo, *dtSfyo, *dtSfzo;
       var *phi0, *trK0;
       var *gxx0, *gxy0, *gxz0, *gyy0, *gyz0, *gzz0;
       var *Axx0, *Axy0, *Axz0, *Ayy0, *Ayz0, *Azz0;
       var *Gmx0, *Gmy0, *Gmz0;
       var *Lap0, *Sfx0, *Sfy0, *Sfz0;
       var *dtSfx0, *dtSfy0, *dtSfz0;
       var *phi, *trK;
       var *gxx, *gxy, *gxz, *gyy, *gyz, *gzz;
       var *Axx, *Axy, *Axz, *Ayy, *Ayz, *Azz;
       var *Gmx, *Gmy, *Gmz;
       var *Lap, *Sfx, *Sfy, *Sfz;
       var *dtSfx, *dtSfy, *dtSfz;
       var *phi1, *trK1;
       var *gxx1, *gxy1, *gxz1, *gyy1, *gyz1, *gzz1;
       var *Axx1, *Axy1, *Axz1, *Ayy1, *Ayz1, *Azz1;
       var *Gmx1, *Gmy1, *Gmz1;
       var *Lap1, *Sfx1, *Sfy1, *Sfz1;
       var *dtSfx1, *dtSfy1, *dtSfz1;
       var *phi_rhs, *trK_rhs;
       var *gxx_rhs, *gxy_rhs, *gxz_rhs, *gyy_rhs, *gyz_rhs, *gzz_rhs;
       var *Axx_rhs, *Axy_rhs, *Axz_rhs, *Ayy_rhs, *Ayz_rhs, *Azz_rhs;
       var *Gmx_rhs, *Gmy_rhs, *Gmz_rhs;
       var *Lap_rhs, *Sfx_rhs, *Sfy_rhs, *Sfz_rhs;
       var *dtSfx_rhs, *dtSfy_rhs, *dtSfz_rhs;
       // Source, connection, Ricci, Psi4 and constraint grid functions.
       var *rho, *Sx, *Sy, *Sz, *Sxx, *Sxy, *Sxz, *Syy, *Syz, *Szz;
       var *Gamxxx, *Gamxxy, *Gamxxz, *Gamxyy, *Gamxyz, *Gamxzz;
       var *Gamyxx, *Gamyxy, *Gamyxz, *Gamyyy, *Gamyyz, *Gamyzz;
       var *Gamzxx, *Gamzxy, *Gamzxz, *Gamzyy, *Gamzyz, *Gamzzz;
       var *Rxx, *Rxy, *Rxz, *Ryy, *Ryz, *Rzz;
       var *Rpsi4, *Ipsi4;
       var *t1Rpsi4, *t1Ipsi4, *t2Rpsi4, *t2Ipsi4;
       var *Cons_Ham, *Cons_Px, *Cons_Py, *Cons_Pz, *Cons_Gx, *Cons_Gy, *Cons_Gz;
 #ifdef Point_Psi4
       // Extra derivative grid functions, compiled in only with Point_Psi4.
       var *phix, *phiy, *phiz;
       var *trKx, *trKy, *trKz;
       var *Axxx, *Axxy, *Axxz;
       var *Axyx, *Axyy, *Axyz;
       var *Axzx, *Axzy, *Axzz;
       var *Ayyx, *Ayyy, *Ayyz;
       var *Ayzx, *Ayzy, *Ayzz;
       var *Azzx, *Azzy, *Azzz;
 #endif
       // FIXME: uc = StateList, up = OldStateList, upp = SynchList_cor; so never touch these three data
       MyList<var> *StateList, *SynchList_pre, *SynchList_cor, *RHSList;
       MyList<var> *OldStateList, *DumpList;
       MyList<var> *ConstraintList;
       monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
       monitor *ConVMonitor;
       surface_integral *Waveshell;
       checkpoint *CheckPoint;
 public:
       // Constructor arguments carry an "i" suffix matching the member of the
       // same name (e.g. Couranti -> Courant).
       bssn_class(double Couranti, double StartTimei, double TotalTimei, double DumpTimei, double d2DumpTimei, double CheckTimei, double AnasTimei,
                  int Symmetryi, int checkruni, char *checkfilenamei, double numepssi, double numepsbi, double numepshi,
                  int a_levi, int maxli, int decni, double maxrexi, double drexi);
       ~bssn_class();
       void Evolve(int Steps);
       void RecursiveStep(int lev);
 #if (PSTR == 1)
       void ParallelStep();
       void SHStep();
 #endif
       void RestrictProlong(int lev, int YN, bool BB, MyList<var> *SL, MyList<var> *OL, MyList<var> *corL);
       void RestrictProlong_aux(int lev, int YN, bool BB, MyList<var> *SL, MyList<var> *OL, MyList<var> *corL);
       void RestrictProlong(int lev, int YN, bool BB);
       void ProlongRestrict(int lev, int YN, bool BB);
       void Setup_Black_Hole_position();
       void compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, var *fory, var *forz, int lev);
       bool read_Pablo_file(int *ext, double *datain, char *filename);
       void write_Pablo_file(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
                             char *filename);
       void AnalysisStuff(int lev, double dT_lev);
       void Setup_KerrSchild();
       void Enforce_algcon(int lev, int fg);
       void testRestrict();
       void testOutBd();
       // Virtual hooks: declared virtual so they can be overridden in
       // subclasses.
       virtual void Setup_Initial_Data_Lousto();
       virtual void Setup_Initial_Data_Cao();
       virtual void Initialize();
       virtual void Read_Ansorg();
       virtual void Read_Pablo() {};
       virtual void Compute_Psi4(int lev);
       virtual void Step(int lev, int YN);
       virtual void Interp_Constraint(bool infg);
       virtual void Constraint_Out();
       virtual void Compute_Constraint();
 #ifdef With_AHF
       // Apparent-horizon-finder state and interface, compiled in only with With_AHF.
 protected:
       MyList<var> *AHList, *AHDList, *GaugeList;
       int AHfindevery;
       double AHdumptime;
       int *lastahdumpid, HN_num; // number of possible horizons
       int *findeveryl;
       double *xc, *yc, *zc, *xr, *yr, *zr;
       bool *trigger;
       double *dTT;
       int *dumpid;
 public:
       void AH_Prepare_derivatives();
       bool AH_Interp_Points(MyList<var> *VarList,
                             int NN, double **XX,
                             double *Shellf, int Symmetryi);
       void AH_Step_Find(int lev, double dT_lev);
 #endif
 };
 #endif /* BSSN_GPU_CLASS_H */
--- a/AMSS_NCKU_source/bssn_rhs.h
+++ b/AMSS_NCKU_source/bssn_rhs.h
@@ -5,8 +5,9 @@
 #ifdef fortran1
 #define f_compute_rhs_bssn compute_rhs_bssn
 #define f_compute_rhs_bssn_ss compute_rhs_bssn_ss
-#define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar
+#define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar
-#define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss
+#define f_compute_rhs_bssn_escalar_matter compute_rhs_bssn_escalar_matter
 #define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss
 #define f_compute_rhs_Z4c compute_rhs_z4c
 #define f_compute_rhs_Z4cnot compute_rhs_z4cnot
 #define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss
@@ -15,8 +16,9 @@
 #ifdef fortran2
 #define f_compute_rhs_bssn COMPUTE_RHS_BSSN
 #define f_compute_rhs_bssn_ss COMPUTE_RHS_BSSN_SS
-#define f_compute_rhs_bssn_escalar COMPUTE_RHS_BSSN_ESCALAR
+#define f_compute_rhs_bssn_escalar COMPUTE_RHS_BSSN_ESCALAR
-#define f_compute_rhs_bssn_escalar_ss COMPUTE_RHS_BSSN_ESCALAR_SS
+#define f_compute_rhs_bssn_escalar_matter COMPUTE_RHS_BSSN_ESCALAR_MATTER
 #define f_compute_rhs_bssn_escalar_ss COMPUTE_RHS_BSSN_ESCALAR_SS
 #define f_compute_rhs_Z4c COMPUTE_RHS_Z4C
 #define f_compute_rhs_Z4cnot COMPUTE_RHS_Z4CNOT
 #define f_compute_rhs_Z4c_ss COMPUTE_RHS_Z4C_SS
@@ -25,8 +27,9 @@
 #ifdef fortran3
 #define f_compute_rhs_bssn compute_rhs_bssn_
 #define f_compute_rhs_bssn_ss compute_rhs_bssn_ss_
-#define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar_
+#define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar_
-#define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss_
+#define f_compute_rhs_bssn_escalar_matter compute_rhs_bssn_escalar_matter_
 #define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss_
 #define f_compute_rhs_Z4c compute_rhs_z4c_
 #define f_compute_rhs_Z4cnot compute_rhs_z4cnot_
 #define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_
@@ -63,34 +66,13 @@ extern "C"
                               double *, double *, double *, double *, double *, double *,                                         // Christoffel
                               double *, double *, double *, double *, double *, double *,                                         // Christoffel
                               double *, double *, double *, double *, double *, double *,                                         // Ricci
-                               double *, double *, double *, double *, double *, double *, double *,                               // constraint violation
+                               double *, double *, double *, double *, double *, double *, double *,                               // constraint violation
-                               int &, int &, double &, int &);
+                               int &, int &, double &, int &);
-}
+}
-
+
-int f_compute_rhs_bssn_escalar_c(int *, double &, double *, double *, double *,                                                      // ex,T,X,Y,Z
+extern "C"
-                                 double *, double *,                                                                                 // chi, trK
+{
-                                 double *, double *, double *, double *, double *, double *,                                         // gij
+        int f_compute_rhs_bssn_ss(int *, double &, double *, double *, double *,                                                      // ex,T,rho,sigma,R
                                 double *, double *, double *, double *, double *, double *,                                         // Aij
                                 double *, double *, double *,                                                                       // Gam
                                 double *, double *, double *, double *, double *, double *, double *,                               // Gauge
                                 double *, double *,                                                                                 // Sphi, Spi
                                 double *, double *,                                                                                 // chi, trK
                                 double *, double *, double *, double *, double *, double *,                                         // gij
                                 double *, double *, double *, double *, double *, double *,                                         // Aij
                                 double *, double *, double *,                                                                       // Gam
                                 double *, double *, double *, double *, double *, double *, double *,                               // Gauge
                                 double *, double *,                                                                                 // Sphi, Spi
                                 double *, double *, double *, double *, double *, double *, double *, double *, double *, double *, // stress-energy
                                 double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                 double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                 double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                 double *, double *, double *, double *, double *, double *,                                         // Ricci
                                 double *, double *, double *, double *, double *, double *, double *,                               // constraint violation
                                 int &, int &, double &, int &);
 extern "C"
 {
        int f_compute_rhs_bssn_ss(int *, double &, double *, double *, double *,                                                      // ex,T,rho,sigma,R
                                  double *, double *, double *,                                                                       // X,Y,Z
                                  double *, double *, double *,                                                                       // drhodx,drhody,drhodz
                                  double *, double *, double *,                                                                       // dsigmadx,dsigmady,dsigmadz
@@ -117,6 +99,20 @@ extern "C"
                                  int &, int &, double &, int &, int &);
 }
 // C-linkage prototype for the routine behind the
 // f_compute_rhs_bssn_escalar_matter macro, which this header maps to
 // compute_rhs_bssn_escalar_matter, COMPUTE_RHS_BSSN_ESCALAR_MATTER or
 // compute_rhs_bssn_escalar_matter_ under fortran1, fortran2 or fortran3.
 // The trailing comment on each line names the argument group it carries.
 // NOTE(review): the last three scalars (int &, int &, double &) are unlabelled;
 // presumably Symmetry, Lev and eps as in the other RHS prototypes -- confirm
 // against the implementation.
 extern "C"
 {
        int f_compute_rhs_bssn_escalar_matter(int *, double &, double *, double *, double *,                                             // ex,T,X,Y,Z
                                              double *, double *,                                                                        // chi, trK
                                              double *, double *, double *, double *, double *, double *,                                // gij
                                              double *, double *, double *, double *, double *, double *,                                // Aij
                                              double *, double *, double *,                                                              // Gam
                                              double *, double *, double *, double *, double *, double *, double *,                      // Gauge
                                              double *, double *,                                                                        // Sphi, Spi
                                              double *, double *,                                                                        // Sphi, Spi rhs
                                              double *, double *, double *, double *, double *, double *, double *, double *, double *, double *, // stress-energy
                                              int &, int &, double &);
 }
 extern "C"
 {
        int f_compute_rhs_bssn_escalar(int *, double &, double *, double *, double *,                                                      // ex,T,X,Y,Z
@@ -137,14 +133,14 @@ extern "C"
                                       double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                       double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                       double *, double *, double *, double *, double *, double *,                                         // Ricci
-                                       double *, double *, double *, double *, double *, double *, double *,                               // constraint violation
+                                       double *, double *, double *, double *, double *, double *, double *,                               // constraint violation
-                                       int &, int &, double &, int &);
+                                       int &, int &, double &, int &);
-}
+}
-
+
-extern "C"
+extern "C"
-{
+{
-        int f_compute_rhs_bssn_escalar_ss(int *, double &, double *, double *, double *,                                                      // ex,T,rho,sigma,R
+        int f_compute_rhs_bssn_escalar_ss(int *, double &, double *, double *, double *,                                                      // ex,T,rho,sigma,R
-                                          double *, double *, double *,                                                                       // X,Y,Z
+                                          double *, double *, double *,                                                                       // X,Y,Z
                                          double *, double *, double *,                                                                       // drhodx,drhody,drhodz
                                          double *, double *, double *,                                                                       // dsigmadx,dsigmady,dsigmadz
                                          double *, double *, double *,                                                                       // dRdx,dRdy,dRdz
--- a/AMSS_NCKU_source/bssn_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_rhs_c.C
@@ -1098,12 +1098,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
            betaz_rhs[i] = FF * dtSfz[i];
            reta[i] =
-                gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i]
+                gupxx[i] * chix[i] * chix[i]
-                + gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i]
+                + gupyy[i] * chiy[i] * chiy[i]
-                + gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i]
+                + gupzz[i] * chiz[i] * chiz[i]
-                + TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i]
+                + TWO * ( gupxy[i] * chix[i] * chiy[i]
-                        + gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i]
+                        + gupxz[i] * chix[i] * chiz[i]
-                        + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
+                        + gupyz[i] * chiy[i] * chiz[i] );
            #if (GAUGE == 2)
            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
@@ -1116,12 +1116,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
            dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
            #elif (GAUGE == 4 || GAUGE == 5)
            reta[i] =
-                gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i]
+                gupxx[i] * chix[i] * chix[i]
-                + gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i]
+                + gupyy[i] * chiy[i] * chiy[i]
-                + gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i]
+                + gupzz[i] * chiz[i] * chiz[i]
-                + TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i]
+                + TWO * ( gupxy[i] * chix[i] * chiy[i]
-                        + gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i]
+                        + gupxz[i] * chix[i] * chiz[i]
-                        + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
+                        + gupyz[i] * chiy[i] * chiz[i] );
            #if (GAUGE == 4)
            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -0,0 +1,470 @@
 #ifndef BSSN_RHS_CUDA_H
 #define BSSN_RHS_CUDA_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Field counts used by the CUDA entry points declared in this header.
 // BSSN_CUDA_STATE_COUNT (24) equals the number of field pointers
 // chi ... dtSfz in the f_compute_rhs_bssn prototype, and
 // BSSN_CUDA_MATTER_COUNT (10) equals its matter pointers rho ... Szz.
 // NOTE(review): presumably these are also the lengths of the double**
 // state/matter arguments of bssn_cuda_rk4_substep -- confirm in the .cu file.
 enum {
    BSSN_CUDA_STATE_COUNT = 24,
    BSSN_CUDA_MATTER_COUNT = 10
 };
 // Named-parameter prototype for f_compute_rhs_bssn. Argument groups, in order:
 //   ex, T, X, Y, Z
 //   24 field pointers: chi, trK, metric (dxx..dzz), Axx..Azz, Gamx..Gamz,
 //                      Lap, betax..betaz, dtSfx..dtSfz
 //   24 matching *_rhs pointers
 //   10 matter pointers: rho, Sx..Sz, Sxx..Szz
 //   18 Gam??? pointers and 6 R?? pointers
 //   7 *_Res pointers: ham_Res, mov{x,y,z}_Res, Gm{x,y,z}_Res
 //   Symmetry, Lev, eps, co
 // T, Symmetry, Lev, eps and co are C++ references, so this header cannot be
 // compiled as C even though extern "C" is guarded by #ifdef __cplusplus.
 // NOTE(review): the diagonal metric inputs are named dxx/dyy/dzz while their
 // rhs counterparts are gxx_rhs/gyy_rhs/gzz_rhs; presumably d?? stores an
 // offset form of g?? -- confirm against the implementation.
 int f_compute_rhs_bssn(int *ex, double &T,
                       double *X, double *Y, double *Z,
                       double *chi, double *trK,
                       double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                       double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                       double *Gamx, double *Gamy, double *Gamz,
                       double *Lap, double *betax, double *betay, double *betaz,
                       double *dtSfx, double *dtSfy, double *dtSfz,
                       double *chi_rhs, double *trK_rhs,
                       double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                       double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                       double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                       double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                       double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                       double *rho, double *Sx, double *Sy, double *Sz,
                       double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                       double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                       double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                       double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                       double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                       double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
                       double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
                       int &Symmetry, int &Lev, double &eps, int &co);
 // Prototype for bssn_cuda_rk4_substep.
 //   block_tag                      opaque handle (void *)
 //   ex, X, Y, Z                    same leading pointers as f_compute_rhs_bssn
 //   state_host_in, state_host_out  pointer-to-pointer field tables (double **)
 //   matter_host                    pointer-to-pointer matter table (double **)
 //   propspeed, soa_flat, bbox      read-only double arrays
 //   dT ... chitiny                 scalars passed by non-const reference
 // NOTE(review): the double** tables presumably hold BSSN_CUDA_STATE_COUNT and
 // BSSN_CUDA_MATTER_COUNT entries respectively -- confirm in the .cu file.
 // NOTE(review): the meaning of the int return value is not visible here.
 int bssn_cuda_rk4_substep(void *block_tag,
                          int *ex, double *X, double *Y, double *Z,
                          double **state_host_in,
                          double **state_host_out,
                          double **matter_host,
                          const double *propspeed,
                          const double *soa_flat,
                          const double *bbox,
                          double &dT,
                          double &T,
                          int &RK4,
                          int &apply_bam_bc,
                          int &Symmetry,
                          int &Lev,
                          double &eps,
                          int &co,
                          int &use_zero_matter,
                          int &keep_resident_state,
                          int &apply_enforce_ga,
                          double &chitiny);
 // Prototype for bssn_cuda_compute_escalar_matter.
 // Takes the state pointer table plus four scalar-field host pointers
 // (Sphi, Spi and their *_rhs counterparts). a2 is the only scalar passed by
 // value; Symmetry, Lev, eps, co and apply_enforce_ga are non-const references.
 // NOTE(review): presumably the GPU counterpart of
 // f_compute_rhs_bssn_escalar_matter from bssn_rhs.h -- confirm. The physical
 // meaning of a2 is not visible in this header.
 int bssn_cuda_compute_escalar_matter(void *block_tag,
                                     int *ex, double *X, double *Y, double *Z,
                                     double **state_host_in,
                                     double *Sphi_host,
                                     double *Spi_host,
                                     double *Sphi_rhs_host,
                                     double *Spi_rhs_host,
                                     double a2,
                                     int &Symmetry,
                                     int &Lev,
                                     double &eps,
                                     int &co,
                                     int &apply_enforce_ga);
 // Prototype for bssn_cuda_escalar_finalize_scalar_fields.
 // Receives two host pointers (Sphi_out_host, Spi_out_host), the same
 // read-only propspeed/soa_flat/bbox arrays as bssn_cuda_rk4_substep, and the
 // by-reference scalars dT, RK4, apply_bam_bc, Symmetry, Lev, eps, precor.
 // NOTE(review): presumably completes the scalar-field part of an RK4
 // sub-step; which inputs it reads is not visible from this signature.
 int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag,
                                             int *ex, double *X, double *Y, double *Z,
                                             double *Sphi_out_host,
                                             double *Spi_out_host,
                                             const double *propspeed,
                                             const double *soa_flat,
                                             const double *bbox,
                                             double &dT,
                                             int &RK4,
                                             int &apply_bam_bc,
                                             int &Symmetry,
                                             int &Lev,
                                             double &eps,
                                             int &precor);
 // Query/download helpers for the scalar fields, addressed by block_tag and
 // (except for the *_has_any_* variant) by the Sphi/Spi host pointers.
 // NOTE(review): "resident" presumably means a device-side copy is cached for
 // the block; whether the int result is a boolean or a status code is not
 // visible in this header.
 int bssn_cuda_escalar_has_resident_fields(void *block_tag,
                                          double *Sphi_host,
                                          double *Spi_host);
 int bssn_cuda_escalar_has_any_resident_fields(void *block_tag);
 int bssn_cuda_escalar_download_fields_if_present(void *block_tag,
                                                 int *ex,
                                                 double *Sphi_host,
                                                 double *Spi_host);
 // Pack/unpack pair for the scalar fields that exchanges data through
 // host_buffer. Both prototypes take identical argument lists.
 // NOTE(review): scalar_host_key presumably identifies the fields by their
 // host pointers; (i0, j0, k0) and (sx, sy, sz) presumably give the origin and
 // size of the region inside the ex-sized block -- confirm in the .cu file.
 int bssn_cuda_pack_escalar_batch_to_host_buffer(void *block_tag,
                                                double **scalar_host_key,
                                                double *host_buffer,
                                                int *ex,
                                                int i0, int j0, int k0,
                                                int sx, int sy, int sz);
 int bssn_cuda_unpack_escalar_batch_from_host_buffer(void *block_tag,
                                                    double **scalar_host_key,
                                                    double *host_buffer,
                                                    int *ex,
                                                    int i0, int j0, int k0,
                                                    int sx, int sy, int sz);
 // Pack/unpack pair for the scalar fields that exchanges data through
 // device_buffer. The argument lists mirror the *_host_buffer variants, with
 // only the buffer parameter renamed.
 // NOTE(review): device_buffer is presumably a device (GPU) address that must
 // not be dereferenced on the host -- confirm against the callers.
 int bssn_cuda_pack_escalar_batch_to_device_buffer(void *block_tag,
                                                  double **scalar_host_key,
                                                  double *device_buffer,
                                                  int *ex,
                                                  int i0, int j0, int k0,
                                                  int sx, int sy, int sz);
 int bssn_cuda_unpack_escalar_batch_from_device_buffer(void *block_tag,
                                                      double **scalar_host_key,
                                                      double *device_buffer,
                                                      int *ex,
                                                      int i0, int j0, int k0,
                                                      int sx, int sy, int sz);
 // Restrict/prolong pair for the scalar fields writing into host_buffer.
 // Both take the output extent (sx, sy, sz) and a read-only scalar_soa array;
 // restrict adds (fi0, fj0, fk0), prolong adds (ii0, jj0, kk0) and
 // (lbc_i, lbc_j, lbc_k).
 // NOTE(review): the index triples are presumably fine/coarse grid offsets and
 // lower-boundary flags, and scalar_soa presumably carries per-field symmetry
 // factors -- none of this is visible in the header; confirm in the .cu file.
 int bssn_cuda_restrict_escalar_batch_to_host_buffer(void *block_tag,
                                                    double **scalar_host_key,
                                                    double *host_buffer,
                                                    int *ex,
                                                    int sx, int sy, int sz,
                                                    int fi0, int fj0, int fk0,
                                                    const double *scalar_soa);
 int bssn_cuda_prolong_escalar_batch_to_host_buffer(void *block_tag,
                                                   double **scalar_host_key,
                                                   double *host_buffer,
                                                   int *ex,
                                                   int sx, int sy, int sz,
                                                   int ii0, int jj0, int kk0,
                                                   int lbc_i, int lbc_j, int lbc_k,
                                                   const double *scalar_soa);
 // Restrict/prolong pair for the scalar fields writing into device_buffer.
 // The argument lists mirror the *_to_host_buffer variants, with only the
 // buffer parameter renamed.
 // NOTE(review): meaning of the index triples and of scalar_soa is not
 // visible in this header -- confirm in the .cu file.
 int bssn_cuda_restrict_escalar_batch_to_device_buffer(void *block_tag,
                                                      double **scalar_host_key,
                                                      double *device_buffer,
                                                      int *ex,
                                                      int sx, int sy, int sz,
                                                      int fi0, int fj0, int fk0,
                                                      const double *scalar_soa);
 int bssn_cuda_prolong_escalar_batch_to_device_buffer(void *block_tag,
                                                     double **scalar_host_key,
                                                     double *device_buffer,
                                                     int *ex,
                                                     int sx, int sy, int sz,
                                                     int ii0, int jj0, int kk0,
                                                     int lbc_i, int lbc_j, int lbc_k,
                                                     const double *scalar_soa);
 // Prototype for bssn_cuda_prepare_escalar_inter_time_level.
 // Takes up to three source key tables, one destination key table, the number
 // of sources in use (source_count) and a tindex selector.
 // NOTE(review): presumably combines the source time levels into dst for use
 // at an intermediate time; whether unused src*_host_key arguments may be null
 // when source_count < 3 is not visible here -- confirm in the .cu file.
 int bssn_cuda_prepare_escalar_inter_time_level(void *block_tag,
                                               int *ex,
                                               double **src1_host_key,
                                               double **src2_host_key,
                                               double **src3_host_key,
                                               double **dst_host_key,
                                               int source_count,
                                               int tindex);
 // To-host / from-host pair operating on one field selected by state_index.
 // Both take the same arguments: a host_state pointer, the block extent ex,
 // and the triples (i0, j0, k0) and (sx, sy, sz).
 // NOTE(review): state_index is presumably in [0, BSSN_CUDA_STATE_COUNT) and
 // the triples presumably give region origin and size -- confirm.
 int bssn_cuda_copy_state_region_to_host(void *block_tag,
                                        int state_index,
                                        double *host_state,
                                        int *ex,
                                        int i0, int j0, int k0,
                                        int sx, int sy, int sz);
 int bssn_cuda_copy_state_region_from_host(void *block_tag,
                                          int state_index,
                                          double *host_state,
                                          int *ex,
                                          int i0, int j0, int k0,
                                          int sx, int sy, int sz);
 // Helpers keyed by block_tag that take a table of host field pointers.
 // The two download prototypes have identical argument lists; the *_matches
 // prototype takes only the key table.
 // NOTE(review): the *_if_present variant presumably succeeds as a no-op when
 // nothing is cached for the block, while the plain variant presumably treats
 // that as an error -- the return convention is not visible here; confirm.
 int bssn_cuda_download_resident_state(void *block_tag,
                                      int *ex,
                                      double **state_host_out);
 int bssn_cuda_download_resident_state_if_present(void *block_tag,
                                                 int *ex,
                                                 double **state_host_out);
 int bssn_cuda_resident_state_matches(void *block_tag,
                                     double **state_host_key);
 // Prototype for bssn_cuda_download_constraint_outputs. Unlike the other
 // bssn_cuda_* prototypes around it, it takes no block_tag argument.
 // NOTE(review): constraint_host_out presumably has one entry per *_Res output
 // of f_compute_rhs_bssn (seven) -- confirm in the .cu file.
 int bssn_cuda_download_constraint_outputs(int *ex,
                                          double **constraint_host_out);
 // Single-field pack prototype: selects one field by state_index and takes a
 // host_buffer plus the triples (i0, j0, k0) and (sx, sy, sz). Its unpack
 // counterpart, bssn_cuda_unpack_state_region_from_host_buffer, is declared
 // after the interpolation prototypes.
 // NOTE(review): buffer layout and required size are not visible here.
 int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
                                               int state_index,
                                               double *host_buffer,
                                               int *ex,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz);
 // Prototype for bssn_cuda_interp_state_point3. All scalars are passed by
 // value: three field indices (state0..state2), the triples (x0, y0, z0),
 // (dx, dy, dz) and (px, py, pz), plus ordn and symmetry. soa3 is read-only
 // and out3 is the writable result pointer.
 // NOTE(review): presumably interpolates three fields at the single point
 // (px, py, pz) on a uniform grid with origin (x0, y0, z0) and spacing
 // (dx, dy, dz), writing three values to out3 -- confirm in the .cu file.
 int bssn_cuda_interp_state_point3(void *block_tag,
                                  int *ex,
                                  int state0,
                                  int state1,
                                  int state2,
                                  double x0,
                                  double y0,
                                  double z0,
                                  double dx,
                                  double dy,
                                  double dz,
                                  double px,
                                  double py,
                                  double pz,
                                  int ordn,
                                  int symmetry,
                                  const double *soa3,
                                  double *out3);
 // Prototype for bssn_cuda_interp_host_two_fields. Differs from
 // bssn_cuda_interp_state_point3 in that the fields are given as two host
 // pointers (field0, field1) rather than indices, and the points are given as
 // read-only arrays px/py/pz together with npoints.
 // NOTE(review): out_interleaved presumably receives 2 * npoints values
 // ordered (field0, field1) per point, and soa6 presumably holds three
 // symmetry factors per field -- confirm in the .cu file.
 int bssn_cuda_interp_host_two_fields(void *block_tag,
                                     int *ex,
                                     double *field0,
                                     double *field1,
                                     double x0,
                                     double y0,
                                     double z0,
                                     double dx,
                                     double dy,
                                     double dz,
                                     const double *px,
                                     const double *py,
                                     const double *pz,
                                     int npoints,
                                     int ordn,
                                     int symmetry,
                                     const double *soa6,
                                     double *out_interleaved);
 // Single-field unpack prototype; its argument list is identical to that of
 // bssn_cuda_pack_state_region_to_host_buffer declared earlier in this header.
 // NOTE(review): host_buffer is not const-qualified even though an unpack
 // presumably only reads it -- confirm before tightening the signature.
 int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
                                                   int state_index,
                                                   double *host_buffer,
                                                   int *ex,
                                                   int i0, int j0, int k0,
                                                   int sx, int sy, int sz);
 // Multi-field pack into host_buffer, in two flavours. The *_for_host_views
 // variant takes one extra argument, state_host_key, ahead of state_count;
 // the remaining arguments are the same.
 // NOTE(review): state_host_key presumably selects which cached field set to
 // read by its host pointers, while the plain variant presumably uses the
 // block's current resident set -- confirm in the .cu file.
 int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
                                              int state_count,
                                              double *host_buffer,
                                              int *ex,
                                              int i0, int j0, int k0,
                                              int sx, int sy, int sz);
 int bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
                                                             double **state_host_key,
                                                             int state_count,
                                                             double *host_buffer,
                                                             int *ex,
                                                             int i0, int j0, int k0,
                                                             int sx, int sy, int sz);
 // Multi-field unpack from host_buffer, in two flavours. The argument lists
 // match the corresponding bssn_cuda_pack_state_batch_to_host_buffer*
 // prototypes; the *_for_host_views variant adds state_host_key.
 // NOTE(review): ordering of fields inside host_buffer is not visible here and
 // must agree with the pack side -- confirm in the .cu file.
 int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                  int state_count,
                                                  double *host_buffer,
                                                  int *ex,
                                                  int i0, int j0, int k0,
                                                  int sx, int sy, int sz);
 int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
                                                                 double **state_host_key,
                                                                 int state_count,
                                                                 double *host_buffer,
                                                                 int *ex,
                                                                 int i0, int j0, int k0,
                                                                 int sx, int sy, int sz);
 // Multi-field restrict into host_buffer, in two flavours (the
 // *_for_host_views variant is declared first and adds state_host_key).
 // Both take the extent (sx, sy, sz), the triple (fi0, fj0, fk0) and a
 // read-only state_soa array.
 // NOTE(review): (fi0, fj0, fk0) is presumably the fine-grid start index and
 // state_soa presumably carries per-field symmetry factors -- confirm.
 int bssn_cuda_restrict_state_batch_to_host_buffer_for_host_views(void *block_tag,
                                                                 double **state_host_key,
                                                                 int state_count,
                                                                 double *host_buffer,
                                                                 int *ex,
                                                                 int sx, int sy, int sz,
                                                                 int fi0, int fj0, int fk0,
                                                                 const double *state_soa);
 int bssn_cuda_restrict_state_batch_to_host_buffer(void *block_tag,
                                                  int state_count,
                                                  double *host_buffer,
                                                  int *ex,
                                                  int sx, int sy, int sz,
                                                  int fi0, int fj0, int fk0,
                                                  const double *state_soa);
 // Multi-field prolong into host_buffer, in two flavours (the
 // *_for_host_views variant is declared first and adds state_host_key).
 // Both take the extent (sx, sy, sz), the triples (ii0, jj0, kk0) and
 // (lbc_i, lbc_j, lbc_k), and a read-only state_soa array.
 // NOTE(review): (ii0, jj0, kk0) is presumably the coarse-grid start index and
 // lbc_* presumably flag lower-boundary handling per axis -- confirm.
 int bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views(void *block_tag,
                                                                double **state_host_key,
                                                                int state_count,
                                                                double *host_buffer,
                                                                int *ex,
                                                                int sx, int sy, int sz,
                                                                int ii0, int jj0, int kk0,
                                                                int lbc_i, int lbc_j, int lbc_k,
                                                                const double *state_soa);
 int bssn_cuda_prolong_state_batch_to_host_buffer(void *block_tag,
                                                 int state_count,
                                                 double *host_buffer,
                                                 int *ex,
                                                 int sx, int sy, int sz,
                                                 int ii0, int jj0, int kk0,
                                                 int lbc_i, int lbc_j, int lbc_k,
                                                 const double *state_soa);
 // Multi-field pack into device_buffer, in two flavours. The argument lists
 // mirror the *_to_host_buffer pack prototypes, with only the buffer parameter
 // renamed; the *_for_host_views variant adds state_host_key.
 // NOTE(review): device_buffer is presumably a device (GPU) address -- confirm
 // against the callers.
 int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                                 int state_count,
                                                double *device_buffer,
                                                int *ex,
                                                int i0, int j0, int k0,
                                                int sx, int sy, int sz);
 int bssn_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                               double **state_host_key,
                                                               int state_count,
                                                               double *device_buffer,
                                                               int *ex,
                                                               int i0, int j0, int k0,
                                                               int sx, int sy, int sz);
 // Multi-field unpack from device_buffer, in two flavours. The argument lists
 // match the corresponding bssn_cuda_pack_state_batch_to_device_buffer*
 // prototypes; the *_for_host_views variant adds state_host_key.
 // NOTE(review): ordering of fields inside device_buffer is not visible here
 // and must agree with the pack side -- confirm in the .cu file.
 int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                    int state_count,
                                                    double *device_buffer,
                                                    int *ex,
                                                    int i0, int j0, int k0,
                                                    int sx, int sy, int sz);
 int bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
                                                                   double **state_host_key,
                                                                   int state_count,
                                                                   double *device_buffer,
                                                                   int *ex,
                                                                   int i0, int j0, int k0,
                                                                   int sx, int sy, int sz);
 // Segment-list pack into device_buffer, in two flavours. Instead of a single
 // (i0, j0, k0)/(sx, sy, sz) region these take segment_count and a read-only
 // int array segment_meta; the *_for_host_views variant adds state_host_key.
 // NOTE(review): the per-segment layout of segment_meta (number and order of
 // ints per segment) is not visible in this header -- confirm in the .cu file.
 int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
                                                   int *ex,
                                                   int segment_count,
                                                   const int *segment_meta);
 int bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *device_buffer,
                                                                  int *ex,
                                                                  int segment_count,
                                                                  const int *segment_meta);
 // Segment-list unpack from device_buffer, in two flavours. The argument
 // lists match the corresponding bssn_cuda_pack_state_segments_to_device_buffer*
 // prototypes; the *_for_host_views variant adds state_host_key.
 // NOTE(review): segment_meta must presumably use the same layout as on the
 // pack side; that layout is not visible in this header -- confirm.
 int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
                                                       int state_count,
                                                       double *device_buffer,
                                                       int *ex,
                                                       int segment_count,
                                                       const int *segment_meta);
 int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
                                                                      double **state_host_key,
                                                                      int state_count,
                                                                      double *device_buffer,
                                                                      int *ex,
                                                                      int segment_count,
                                                                      const int *segment_meta);
 /* Segment-based restrict/prolong entry points writing into `device_buffer`.
  * NOTE(review): "restrict"/"prolong" presumably denote the fine-to-coarse and
  * coarse-to-fine mesh-refinement transfers; the `segment_meta` layout and the
  * int return convention are not visible here -- confirm against the
  * implementation. */
 int bssn_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
                                                       int state_count,
                                                       double *device_buffer,
                                                       int *ex,
                                                       int segment_count,
                                                       const int *segment_meta);
 /* Host-view variant: adds `state_host_key` after `block_tag` and a trailing
  * read-only `state_soa` array.
  * NOTE(review): `state_soa` presumably holds the states in
  * structure-of-arrays order -- confirm. */
 int bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                       double **state_host_key,
                                                                       int state_count,
                                                                       double *device_buffer,
                                                                       int *ex,
                                                                       int segment_count,
                                                                       const int *segment_meta,
                                                                       const double *state_soa);
 /* Prolong counterpart of the segment restrict call; identical parameter
  * list. */
 int bssn_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
                                                      int state_count,
                                                      double *device_buffer,
                                                      int *ex,
                                                      int segment_count,
                                                      const int *segment_meta);
 /* Host-view prolong variant: adds `state_host_key` and a trailing
  * `state_soa`, mirroring the host-view restrict variant. */
 int bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                      double **state_host_key,
                                                                      int state_count,
                                                                      double *device_buffer,
                                                                      int *ex,
                                                                      int segment_count,
                                                                      const int *segment_meta,
                                                                      const double *state_soa);
 /* Single-box ("batch") restrict/prolong entry points writing into
  * `device_buffer`.
  * NOTE(review): sx/sy/sz presumably give the extent of the output box and
  * fi0/fj0/fk0 the starting fine-grid indices; the int return convention is
  * not visible here -- confirm against the implementation. */
 int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                     int state_count,
                                                     double *device_buffer,
                                                     int *ex,
                                                     int sx, int sy, int sz,
                                                     int fi0, int fj0, int fk0);
 /* Host-view variant: adds `state_host_key` after `block_tag` and a trailing
  * read-only `state_soa` array. */
 int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                    double **state_host_key,
                                                                    int state_count,
                                                                    double *device_buffer,
                                                                    int *ex,
                                                                    int sx, int sy, int sz,
                                                                    int fi0, int fj0, int fk0,
                                                                    const double *state_soa);
 /* Prolong counterpart.  Takes ii0/jj0/kk0 in place of fi0/fj0/fk0 plus three
  * extra integers lbc_i/lbc_j/lbc_k.
  * NOTE(review): the lbc_* arguments look like per-axis lower-boundary
  * flags/offsets -- confirm. */
 int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                    int state_count,
                                                    double *device_buffer,
                                                    int *ex,
                                                    int sx, int sy, int sz,
                                                    int ii0, int jj0, int kk0,
                                                    int lbc_i, int lbc_j, int lbc_k);
 /* Host-view prolong variant: adds `state_host_key` and a trailing
  * `state_soa`. */
 int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                   double **state_host_key,
                                                                   int state_count,
                                                                   double *device_buffer,
                                                                   int *ex,
                                                                   int sx, int sy, int sz,
                                                                   int ii0, int jj0, int kk0,
                                                                   int lbc_i, int lbc_j, int lbc_k,
                                                                   const double *state_soa);
 /* Transfer a subset of the state variables for one block.  `state_indices`
  * is a read-only array and `subset_count` an int passed alongside it;
  * `state_host_out` is an array of host pointers.
  * NOTE(review): presumably copies device -> host with `subset_count` entries
  * in both arrays; return convention not visible here -- confirm. */
 int bssn_cuda_download_state_subset(void *block_tag,
                                     int *ex,
                                     int subset_count,
                                     const int *state_indices,
                                     double **state_host_out);
 /* Mirror of the download call, taking `state_host_in` instead.
  * NOTE(review): presumably copies host -> device -- confirm. */
 int bssn_cuda_upload_state_subset(void *block_tag,
                                   int *ex,
                                   int subset_count,
                                   const int *state_indices,
                                   double **state_host_in);
 /* Takes up to three source host-key arrays, one destination host-key array,
  * a `source_count` and a `tindex`.
  * NOTE(review): presumably builds the intermediate-time-level data on the
  * device from `source_count` of the src arrays, with `tindex` selecting the
  * target time; semantics and return convention are not visible here --
  * confirm against the implementation. */
 int bssn_cuda_prepare_inter_time_level(void *block_tag,
                                        int *ex,
                                        double **src1_host_key,
                                        double **src2_host_key,
                                        double **src3_host_key,
                                        double **dst_host_key,
                                        int source_count,
                                        int tindex);
 /* NOTE(review): presumably returns non-zero when state for `block_tag` is
  * currently held on the device -- confirm. */
 int bssn_cuda_has_resident_state(void *block_tag);
 /* NOTE(review): presumably frees the per-block step context associated with
  * `block_tag`; returns nothing. */
 void bssn_cuda_release_step_ctx(void *block_tag);
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/AMSS_NCKU_source/bssn_step_gpu.C
+++ b/AMSS_NCKU_source/bssn_step_gpu.C
--- a/AMSS_NCKU_source/gaussj.C
+++ b/AMSS_NCKU_source/gaussj.C
@@ -18,7 +18,7 @@ using namespace std;
 #endif
 // Intel oneMKL LAPACK interface
-#include <lapacke.h>
+#include <mkl_lapacke.h>
 /* Linear equation solution using Intel oneMKL LAPACK.
 a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
 containing the right-hand side vectors. On output a is
--- a/AMSS_NCKU_source/macrodef.fh
+++ b/AMSS_NCKU_source/macrodef.fh
@@ -13,7 +13,7 @@
 #define ABV 0
-#define EScalar_CC 2
+#define EScalar_CC  2
 #if 0
--- a/AMSS_NCKU_source/macrodef.h
+++ b/AMSS_NCKU_source/macrodef.h
@@ -10,7 +10,7 @@
 #define GaussInt
-#define ABEtype 0
+#define ABEtype 1
 //#define With_AHF
 #define Psi4type 0
@@ -167,3 +167,4 @@
 #define TINY 1e-10
 #endif   /* MICRODEF_H */
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -1,208 +1,260 @@
-
+
-
+
-include makefile.inc
+include makefile.inc
-
+
-include AMSS_NCKU_build.mk
+## polint(ordn=6) kernel selector:
-
+##   1 (default): barycentric fast path
-ABE_TYPE ?= $(shell awk '/^[[:space:]]*\#define[[:space:]]+ABEtype/ {print $$3; exit}' macrodef.h 2>/dev/null)
+##   0          : fallback to Neville path
-
+POLINT6_USE_BARY ?= 1
-ifeq ($(USE_TRANSFER_CACHE),auto)
+POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
-ifeq ($(ABE_TYPE),0)
+
-EFFECTIVE_USE_TRANSFER_CACHE = 1
+## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
-else
+##   make                        -> opt  (PGO-guided, maximum performance)
-EFFECTIVE_USE_TRANSFER_CACHE = 0
+##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
-endif
+PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
-else
+
-EFFECTIVE_USE_TRANSFER_CACHE = $(USE_TRANSFER_CACHE)
+ifeq ($(TOOLCHAIN),intel)
-endif
+OMP_FLAG = -qopenmp
-
+
-ifeq ($(USE_CXX_ESCALAR_KERNEL),1)
+ifeq ($(PGO_MODE),instrument)
-ifeq ($(ABE_TYPE),1)
+## Intel Phase 1: instrumentation — omits -fp-model fast=2 for numerical stability (-ipo is still enabled below)
-EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 1
+CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-else
+              -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
-EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
+f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-endif
+              -align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
-else
+else
-EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
+## opt (default): maximum-performance flags; PGO profile data (-fprofile-instr-use=$(PROFDATA)) is currently not applied
-endif
+## PGO has been turned off: testing showed it to be a negative optimization
-
+## INTERP_LB_FLAGS has been turned off too: testing showed it to be a negative optimization
-ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
+
-ifeq ($(USE_CXX_KERNELS),0)
+
-$(error USE_CXX_ESCALAR_KERNEL=1 requires USE_CXX_KERNELS=1 because bssn_escalar_rhs_c.C reuses the C BSSN kernel)
+CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-endif
+              -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
-endif
+f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-
+              -align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
-## polint(ordn=6) kernel selector:
+endif
-##   1 (default): barycentric fast path
+
-##   0          : fallback to Neville path
+TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-POLINT6_USE_BARY ?= 1
+              -fprofile-instr-use=$(TP_PROFDATA) \
-POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
+              -Dfortran3 -Dnewc $(MKL_INC)
-TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
+else
-ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
+## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
-
+## PGO_MODE is ignored in this branch.
-## AMD AOCC build flags optimized for EPYC Zen 4 (-march=znver4)
+OMP_FLAG = -mp
-CXXAPPFLAGS = -O3 -march=znver4 -ffast-math -flto \
+CXXAPPFLAGS = -O3 -tp=host -Mcache_align -Mfma \
-              -Dfortran3 -Dnewc -I$(AOCL_ROOT)/include $(INTERP_LB_FLAGS) \
+              -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
-              $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG)
+f90appflags = -O3 -tp=host -Mcache_align -Mfma -Mpreprocess \
-f90appflags = -O3 -march=znver4 -ffast-math -flto \
+              $(MKL_INC) $(POLINT6_FLAG)
-              -cpp -I$(AOCL_ROOT)/include $(POLINT6_FLAG)
+TP_OPTFLAGS = -O3 -tp=host -Mcache_align -Mfma \
-
+              -Dfortran3 -Dnewc $(MKL_INC)
-.SUFFIXES: .o .f90 .C .for .cu
+endif
-
+
-.f90.o:
+.SUFFIXES: .o .f90 .C .for .cu
-	$(f90) $(f90appflags) -c $< -o $@
+
-
+.f90.o:
-.C.o:
+	$(f90) $(f90appflags) -c $< -o $@
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+
-
+.C.o:
-.for.o:
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-	$(f77) -c $< -o $@
+
-
+.for.o:
 	$(f77) -c $< -o $@
 .cu.o:
 	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
-# C rewrite of BSSN RHS kernel and helpers
+# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
-bssn_rhs_c.o: bssn_rhs_c.C
+bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
-
+
-fderivs_c.o: fderivs_c.C
+# CUDA rewrite of Z4C Cartesian RHS
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+z4c_rhs_cuda.o: z4c_rhs_cuda.cu z4c_rhs_cuda.h bssn_rhs.h macrodef.h ricci_gamma.h
-
+	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
-fdderivs_c.o: fdderivs_c.C
+
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+# C rewrite of BSSN RHS kernel and helpers
-
+bssn_rhs_c.o: bssn_rhs_c.C
-kodiss_c.o: kodiss_c.C
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+
-
+fderivs_c.o: fderivs_c.C
-lopsided_c.o: lopsided_c.C
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+
-
+fdderivs_c.o: fdderivs_c.C
-lopsided_kodis_c.o: lopsided_kodis_c.C
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+
-
+kodiss_c.o: kodiss_c.C
-#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-#	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+
-
+lopsided_c.o: lopsided_c.C
-## TwoPunctureABE uses fixed optimal flags (AMD AOCC, no PGO)
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-TP_OPTFLAGS = -O3 -march=znver4 -ffast-math -flto \
+
-              -Dfortran3 -Dnewc -I$(AOCL_ROOT)/include
+lopsided_kodis_c.o: lopsided_kodis_c.C
-
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-TwoPunctures.o: TwoPunctures.C
+
-	${CXX} $(TP_OPTFLAGS) -fopenmp -c $< -o $@
+z4c_rhs_c.o: z4c_rhs_c.C
-
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-TwoPunctureABE.o: TwoPunctureABE.C
+
-	${CXX} $(TP_OPTFLAGS) -fopenmp -c $< -o $@
+#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
-
+#	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-# Input files
+
-
+TwoPunctures.o: TwoPunctures.C
-## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
+	${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
-ifeq ($(USE_CXX_KERNELS),0)
+
-# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
+TwoPunctureABE.o: TwoPunctureABE.C
-CFILES =
+	${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
-else
+
-# C++ mode (default): C rewrite of bssn/bssn-escalar rhs and helper kernels
+# Input files
-CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
+
-ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
+## CUDA BSSN RHS switch
-CFILES += bssn_escalar_rhs_c.o
+##   1 : use the rewritten CUDA bssn_rhs backend
-endif
+##   0 : keep the normal CPU/Fortran selection below
-endif
+USE_CUDA_BSSN ?= 0
-
+USE_CUDA_Z4C ?= 0
-## RK4 kernel switch (independent from USE_CXX_KERNELS)
+
-ifeq ($(USE_CXX_RK4),1)
+CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
-CFILES += rungekutta4_rout_c.o
+CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
-RK4_F90_OBJ =
+CXXAPPFLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
-else
+CUDA_APP_FLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
-RK4_F90_OBJ = rungekutta4_rout.o
+
-endif
+## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
-
+ifeq ($(USE_CXX_KERNELS),0)
-C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
+# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
-           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
+CFILES_CPU =
-	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
+else
-	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
+# C++ mode (default): C rewrite of bssn_rhs and helper kernels
-           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
+CFILES_CPU = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
-	   NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
+endif
-	   
+
-C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
+CFILES_CUDA_BSSN = bssn_rhs_cuda.o
-           cgh.o surface_integral.o ShellPatch.o\
+
-	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
+ifeq ($(USE_CUDA_BSSN),1)
-	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
+CFILES = $(CFILES_CUDA_BSSN)
-           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
+else
-	   NullShellPatch2_Evo.o \
+CFILES = $(CFILES_CPU)
-	   bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
+endif
-
+
-F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
+ifeq ($(USE_CUDA_Z4C),1)
-	   prolongrestrict_cell.o prolongrestrict_vertex.o\
+CFILES += z4c_rhs_cuda.o
-	   $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
+Z4C_F90_OBJ =
-	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
+else ifeq ($(USE_CXX_Z4C_KERNELS),1)
-	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
+CFILES += z4c_rhs_c.o
-           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
+Z4C_F90_OBJ =
-           fadmquantites_bssn.o Z4c_rhs.o Z4c_rhs_ss.o point_diff_new_sh.o\
+else
-	   cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\
+Z4C_F90_OBJ = Z4c_rhs.o
-	   getnpem2.o empart.o NullNews.o fourdcurvature.o\
+endif
-	   bssn2adm.o adm_constraint.o adm_ricci_gamma.o\
+
-	   scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\
+## RK4 kernel switch (independent from USE_CXX_KERNELS)
-	   NullNews2.o tool_f.o
+ifeq ($(USE_CXX_RK4),1)
-
+RK4_C_OBJ = rungekutta4_rout_c.o
-ifeq ($(USE_CXX_KERNELS),0)
+RK4_F90_OBJ =
-# Fortran mode: include original bssn_rhs.o
+else
-F90FILES = $(F90FILES_BASE) bssn_rhs.o
+RK4_C_OBJ =
-else
+RK4_F90_OBJ = rungekutta4_rout.o
-# C++ mode (default): bssn_rhs.o replaced by C++ kernel
+endif
-F90FILES = $(F90FILES_BASE)
+
-endif
+CFILES += $(RK4_C_OBJ)
-
+ABE_CUDA_CFILES = $(CFILES_CUDA_BSSN) z4c_rhs_cuda.o $(RK4_C_OBJ)
-F77FILES = zbesh.o
+
-
+ABE_LDLIBS = $(LDLIBS)
-AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \
+ifeq ($(USE_CUDA_BSSN),1)
-tgrid.o fd_grid.o ghost_zone.o array.o round.o norm.o fuzzy.o error_exit.o miscfp.o \
+ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH)
-linear_map.o cpm_map.o BH_diagnostics.o setup.o horizon_sequence.o find_horizons.o \
+endif
-initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o
+ifeq ($(USE_CUDA_Z4C),1)
-
+ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH)
-TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o 
+endif
-
+
-CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
+C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
-
+           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
-# file dependences
+	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
-$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
+	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
-
+           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
-$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
+	   NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
-	     misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
+	   
-	     rungekutta4_rout.h var.h bssn_class.h bssn_rhs.h sommerfeld_rout.h\
+#C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
-	     cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\
+           cgh.o surface_integral.o ShellPatch.o\
-             fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\
+	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
-	     NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\
+	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
-	     empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\
+           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
-             initial_null2.h NullShellPatch2.h 
+	   NullShellPatch2_Evo.o \
-             
+	   bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
-$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
+
-	     misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
+F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
-	     rungekutta4_rout.h var.h bssn_rhs.h sommerfeld_rout.h\
+	   prolongrestrict_cell.o prolongrestrict_vertex.o\
-	     cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\
+	   $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
-             fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\
+	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
-	     NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\
+	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
-	     empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\
+           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
-             initial_null2.h NullShellPatch2.h \
+           fadmquantites_bssn.o $(Z4C_F90_OBJ) Z4c_rhs_ss.o point_diff_new_sh.o\
-             bssn_gpu_class.h bssn_macro.h
+	   cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\
-             
+	   getnpem2.o empart.o NullNews.o fourdcurvature.o\
-$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
+	   bssn2adm.o adm_constraint.o adm_ricci_gamma.o\
-
+	   scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\
-$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
+	   NullNews2.o tool_f.o
-
+
-TwoPunctureFILES: TwoPunctures.h
+ifeq ($(USE_CXX_KERNELS),0)
-
+# Fortran mode: include original bssn_rhs.o
-$(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
+F90FILES = $(F90FILES_BASE) bssn_rhs.o
-
+else
-misc.o : zbesh.o
+# C++ mode (default): bssn_rhs.o replaced by C++ kernel
-
+F90FILES = $(F90FILES_BASE)
-# projects
+endif
-ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
+
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
+F77FILES = zbesh.o
-	
+
-ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
+AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
+tgrid.o fd_grid.o ghost_zone.o array.o round.o norm.o fuzzy.o error_exit.o miscfp.o \
-
+linear_map.o cpm_map.o BH_diagnostics.o setup.o horizon_sequence.o find_horizons.o \
-TwoPunctureABE: $(TwoPunctureFILES)
+initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o
-	$(CLINKER) $(TP_OPTFLAGS) -fopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
+
-
+TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o 
-clean:
+
-	rm *.o ABE ABEGPU TwoPunctureABE make.log -f
+#CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
 # file dependences
 $(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
 $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
 	     misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
 	     rungekutta4_rout.h var.h bssn_class.h bssn_rhs.h sommerfeld_rout.h\
 	     cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\
             fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\
 	     NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\
 	     empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\
             initial_null2.h NullShellPatch2.h 
 #$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
 	     misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
 	     rungekutta4_rout.h var.h bssn_rhs.h sommerfeld_rout.h\
 	     cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\
             fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\
 	     NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\
 	     empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\
             initial_null2.h NullShellPatch2.h \
             bssn_gpu_class.h bssn_macro.h
 $(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
 $(C++FILES) $(C++FILES_GPU) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
 # Rebuild the TwoPunctureABE objects whenever TwoPunctures.h changes.
 # The variable must be expanded: a bare "TwoPunctureFILES" names a literal
 # target and leaves TwoPunctureABE.o / TwoPunctures.o without the dependency.
 $(TwoPunctureFILES): TwoPunctures.h
 $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
 misc.o : zbesh.o
 # projects
 ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(ABE_LDLIBS)
 ABE_CUDA: USE_CUDA_BSSN=1
 ABE_CUDA: USE_CUDA_Z4C=1
 ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
 #ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
 #	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
 TwoPunctureABE: $(TwoPunctureFILES)
 	$(CLINKER) $(TP_OPTFLAGS) $(OMP_FLAG) -o $@ $(TwoPunctureFILES) $(LDLIBS)
 clean:
 	rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,17 +1,12 @@
-## AMD AOCC version with AOCL (Optimized for AMD EPYC Zen 4)
+## Toolchain selection
 ##   nvhpc : NVIDIA HPC SDK + CUDA-aware MPI (default)
 ##   intel : Intel oneAPI toolchain (legacy path)
 TOOLCHAIN ?= nvhpc
-## AOCL root path for includes and libraries
+## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
-AOCL_ROOT ?= /home/gh0s7/AOCC/aocl/5.2.0/aocc
+##   opt        : (default) maximum performance with PGO profile-guided optimization
-
+##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
-## AOCC-built OpenMPI prefix
+PGO_MODE ?= opt
 OMPI_PREFIX ?= /home/gh0s7/AOCC/aocc-openmpi
 filein  = -I/usr/include/ -I$(AOCL_ROOT)/include
 ## Using AOCL BLIS + libFLAME for BLAS/LAPACK
 ## AOCC Fortran runtime: -lflang (includes FortranRuntime)
 ## AOCC OpenMP runtime: -lomp (LLVM OpenMP)
 LDLIBS  = -L$(AOCL_ROOT)/lib -lblis -lflame -lamdlibm -lflang -lpgmath -lpthread -lm -ldl -lomp
 ## Interp_Points load balance profiling mode
 ##   off        : (default) no load balance instrumentation
@@ -27,35 +22,70 @@ else
 INTERP_LB_FLAGS =
 endif
 MKLROOT ?= /home/intel/oneapi/mkl/latest
 MKL_LIBDIR ?= $(MKLROOT)/lib/intel64
 MKL_INC ?= -I$(MKLROOT)/include
 NVHPC_ROOT ?= /home/nvidia/hpc_sdk/Linux_x86_64/25.11
 CUDA_HOME  ?= $(NVHPC_ROOT)/cuda
 CUDA_ARCH  ?= sm_80
 ## Kernel implementation switch
 ##   1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
 ##   0           : fall back to original Fortran kernels
 USE_CXX_KERNELS ?= 1
-## BSSN-EScalar RHS switch
+## Z4C Cartesian RHS kernel switch
-##   1 (default) : use BSSN-EScalar C wrapper on the normal patch path
+##   1 (default) : use C++ rewrite of Z4c_rhs (main Cartesian path faster)
-##   0           : keep the original Fortran BSSN-EScalar RHS for precision-safe runs
+##   0           : use original Fortran Z4c_rhs.o
-## Note: this requires USE_CXX_KERNELS=1 because the wrapper reuses the C BSSN kernel.
+USE_CXX_Z4C_KERNELS ?= 1
 USE_CXX_ESCALAR_KERNEL ?= 1
 ## Cached transfer switch
 ##   auto (default): enable for BSSN vacuum, keep other paths on the safe uncached path
 ##   1             : force cached Sync/Restrict/OutBd transfer on evolution hot paths
 ##   0             : force the original uncached transfer path
 USE_TRANSFER_CACHE ?= auto
 ## RK4 kernel implementation switch
 ##   1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
 ##   0           : use original Fortran rungekutta4_rout.o
 USE_CXX_RK4 ?= 1
-f90          = flang
+## Memory allocator switch
-f77          = flang
+##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
-CXX          = clang++
+##   0           : use system default allocator (ptmalloc)
-CC           = clang
+USE_TBBMALLOC ?= 1
-CLINKER      = $(OMPI_PREFIX)/bin/mpicxx
+TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
 ifneq ($(wildcard $(TBBMALLOC_SO)),)
 TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
 else
 TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
 endif
-Cu = nvcc
+ifeq ($(TOOLCHAIN),intel)
-CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
+f90          = ifx
-#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
+f77          = ifx
-CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
+CXX          = icpx
 CC           = icx
 CLINKER      = mpiicpx
 filein  = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
 LDLIBS       = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
               -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
               -lifcore -limf -liomp5 -lpthread -lm -ldl \
               -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart
 else ifeq ($(TOOLCHAIN),nvhpc)
 f90          = mpifort
 f77          = mpifort
 CXX          = mpicxx
 CC           = mpicc
 CLINKER      = mpicxx
 filein       = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
 LDLIBS       = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
               -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
               -lpthread -lm -ldl \
               -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart \
               -fortranlibs
 endif
 ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
 Cu = $(NVHPC_ROOT)/compilers/bin/nvcc
 CUDA_LIB_PATH = -L$(CUDA_HOME)/lib64 -I$(CUDA_HOME)/include
 CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=$(CUDA_ARCH)
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -8,10 +8,11 @@
 #include <iostream>
 #include <iomanip>
 #include <fstream>
-#include <strstream>
+#include <strstream>
-#include <cmath>
+#include <cmath>
-#include <map>
+#include <map>
-using namespace std;
+#include <cstdlib>
 using namespace std;
 #else
 #include <iostream.h>
 #include <iomanip.h>
@@ -29,12 +30,26 @@ using namespace std;
 #include "fadmquantites_bssn.h"
 #include "getnpem2.h"
 #include "getnp4.h"
-#include "parameters.h"
+#include "parameters.h"
-
+
-#define PI M_PI
+#define PI M_PI
-//|============================================================================
+
-//| Constructor
+namespace
-//|============================================================================
+{
 // Reports whether surface-integral timing is switched on.
 // The AMSS_SURFACE_TIMING environment variable is read on the first call
 // only; the result is cached in a function-local static and reused by every
 // later call.  Timing is on when the variable is set and atoi() of its value
 // is non-zero.
 bool amss_surface_timing_enabled()
 {
   // -1: not queried yet, 0: disabled, 1: enabled
   static int cached_state = -1;
   if (cached_state >= 0)
     return cached_state != 0;

   const char *raw_value = getenv("AMSS_SURFACE_TIMING");
   if (!raw_value)
     cached_state = 0;
   else if (atoi(raw_value) != 0)
     cached_state = 1;
   else
     cached_state = 0;
   return cached_state != 0;
 }
 }
 //|============================================================================
 //| Constructor
 //|============================================================================
 surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry),
                                                    wave_cache_spinw(-1),
@@ -484,9 +499,9 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  delete[] IP_out;
  DG_List->clearList();
 }
-void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
+void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
-                                 int spinw, int maxl, int NN, double *RP, double *IP,
+                                 int spinw, int maxl, int NN, double *RP, double *IP,
-                                 monitor *Monitor, MPI_Comm Comm_here) // NN is the length of RP and IP
+                                 monitor *Monitor, MPI_Comm Comm_here) // NN is the length of RP and IP
 {
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"start surface_integral::surf_Wave");
@@ -720,10 +735,10 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  delete[] IP_out;
  DG_List->clearList();
 }
-//|----------------------------------------------------------------
+//|----------------------------------------------------------------
-//  for shell patch
+//  for shell patch
-//|----------------------------------------------------------------
+//|----------------------------------------------------------------
-void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4, var *Ipsi4,
+void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4, var *Ipsi4,
                                 int spinw, int maxl, int NN, double *RP, double *IP,
                                 monitor *Monitor) // NN is the length of RP and IP
 {
@@ -3281,6 +3296,8 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
                                         var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
                                         double *Rout, monitor *Monitor, bool refresh_mass_fields)
 {
  const bool timing = amss_surface_timing_enabled();
  const double t_start = timing ? MPI_Wtime() : 0.0;
  if (Symmetry != 0 && Symmetry != 1)
  {
    surf_Wave(rex, lev, GH, Rpsi4, Ipsi4, spinw, maxl, NN, RP, IP, Monitor);
@@ -3325,6 +3342,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
      Pp = Pp->next;
    }
  }
  const double t_refresh_done = timing ? MPI_Wtime() : 0.0;
  const int InList = 19;
  const int idx_rpsi4 = 0, idx_ipsi4 = 1;
@@ -3380,6 +3398,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
  double *shellf = new double[n_tot * InList];
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
  const double t_interp_done = timing ? MPI_Wtime() : 0.0;
  double *RP_out = new double[NN];
  double *IP_out = new double[NN];
@@ -3496,6 +3515,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
    if (Symmetry == 0)
      p_outz += f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * theta_weight;
  }
  const double t_integral_done = timing ? MPI_Wtime() : 0.0;
  for (int ii = 0; ii < NN; ii++)
  {
@@ -3534,6 +3554,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
    delete[] reduce_out;
    delete[] reduce_in;
  }
  const double t_reduce_done = timing ? MPI_Wtime() : 0.0;
 #ifdef GaussInt
  mass = mass * rex * rex * dphi * factor;
@@ -3565,6 +3586,19 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
  Rout[5] = sy;
  Rout[6] = sz;
  if (timing)
  {
    fprintf(stderr,
            "[AMSS-SURFACE][rank %d] rex=%.6g lev=%d refresh=%.6f interp=%.6f integral=%.6f reduce=%.6f total=%.6f nlocal=%d ntotal=%d modes=%d\n",
            myrank, rex, lev,
            t_refresh_done - t_start,
            t_interp_done - t_refresh_done,
            t_integral_done - t_interp_done,
            t_reduce_done - t_integral_done,
            t_reduce_done - t_start,
            Nmax - Nmin + 1, n_tot, NN);
  }
  delete[] pox[0];
  delete[] pox[1];
  delete[] pox[2];
--- a/AMSS_NCKU_source/surface_integral.h
+++ b/AMSS_NCKU_source/surface_integral.h
@@ -46,10 +46,10 @@ public:
 	surface_integral(int iSymmetry);
 	~surface_integral();
-	void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
+	void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
-				   int spinw, int maxl, int NN, double *RP, double *IP,
+				   int spinw, int maxl, int NN, double *RP, double *IP,
-				   monitor *Monitor); // NN is the length of RP and IP
+				   monitor *Monitor); // NN is the length of RP and IP
-									  // this routine can only deal with the symmetry of Psi4
+									  // this routine can only deal with the symmetry of Psi4
 	void surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4, var *Ipsi4,
 				   int spinw, int maxl, int NN, double *RP, double *IP,
 				   monitor *Monitor);
--- a/AMSS_NCKU_source/z4c_rhs_c.C
+++ b/AMSS_NCKU_source/z4c_rhs_c.C
@@ -0,0 +1,725 @@
 #include "macrodef.h"
 #include "bssn_rhs.h"
 #include "fmisc.h"
 #include "ricci_gamma.h"
 #include "share_func.h"
 #include "tool.h"
 #include <vector>
 #ifdef fortran1
 #define f_constraint_bssn constraint_bssn
 #define f_z4c_rhs_point z4c_rhs_point
 #endif
 #ifdef fortran2
 #define f_constraint_bssn CONSTRAINT_BSSN
 #define f_z4c_rhs_point Z4C_RHS_POINT
 #endif
 #ifdef fortran3
 #define f_constraint_bssn constraint_bssn_
 #define f_z4c_rhs_point z4c_rhs_point_
 #endif
 // External constraint routine; the linker symbol is chosen by the
 // fortran1/fortran2/fortran3 name-mangling macros above.
 // The prototype is unnamed: one int* (grid extents), 59 double* arrays, and a
 // trailing int& (Symmetry). The line grouping below does NOT mirror the logical
 // grouping; the call in compute_rhs_z4c_cartesian passes, in order:
 //   X, Y, Z | chi, trK | dxx, gxy, gxz, dyy, gyz, dzz | Axx..Azz |
 //   Gamx, Gamy, Gamz | Lap, betax, betay, betaz | rho, Sx, Sy, Sz |
 //   Gamxxx..Gamzzz (18 arrays) | Rxx..Rzz | Hcon, Mxcon, Mycon, Mzcon,
 //   Gmxcon, Gmycon, Gmzcon.
 // NOTE(review): which arrays are inputs and which are outputs is not visible
 // here -- confirm against the Fortran source.
 extern "C" void f_constraint_bssn(int *, double *, double *, double *,
                                  double *, double *,
                                  double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *,
                                  double *, double *, double *, double *, double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *,
                                  int &);
 // External single-grid-point Z4c right-hand-side kernel; the linker symbol is
 // chosen by the fortran1/fortran2/fortran3 name-mangling macros above. Every
 // argument is a scalar passed by reference.
 // Naming scheme, as established by the call in compute_rhs_z4c_cartesian
 // (indices 1,2,3 correspond to x,y,z):
 //   A**, g**, G*, chi, alpha, beta*, B*, Khat, Theta : field values at the point
 //       (the caller passes Lap+1 as alpha, chi+1 as chi, d**+1 for the diagonal
 //       g**, dtSf* as B*, trK as Khat and TZ as Theta)
 //   da<i>, dchi<i>, dKhat<i>, dTheta<i>  : first derivative along direction i
 //   dA<i><jk>, delg<i><jk>               : d_i of the (j,k) tensor component
 //   db<i><j>, dB<i><j>, delG<i><j>       : d_i of component j of beta, B, G
 //   dda<ij>, ddchi<ij>                   : second derivatives d_i d_j
 //   ddb<ij><k>                           : d_i d_j of beta component k
 //   deldelg<ij><kl>                      : d_i d_j of metric component (k,l)
 //   chiDivFloor, kappa1, kappa2          : scalar parameters
 //   r*                                   : receive the caller's *_rhs entries
 // NOTE(review): the r* arguments are presumably the only outputs -- confirm
 // against the kernel source. The parameter order is alphabetical-like rather
 // than logical, so any change here must be mirrored exactly at the call site.
 extern "C" void f_z4c_rhs_point(
    double &A11,
    double &A12,
    double &A13,
    double &A22,
    double &A23,
    double &A33,
    double &alpha,
    double &B1,
    double &B2,
    double &B3,
    double &beta1,
    double &beta2,
    double &beta3,
    double &chi,
    double &chiDivFloor,
    double &da1,
    double &dA111,
    double &dA112,
    double &dA113,
    double &dA122,
    double &dA123,
    double &dA133,
    double &da2,
    double &dA211,
    double &dA212,
    double &dA213,
    double &dA222,
    double &dA223,
    double &dA233,
    double &da3,
    double &dA311,
    double &dA312,
    double &dA313,
    double &dA322,
    double &dA323,
    double &dA333,
    double &db11,
    double &dB11,
    double &db12,
    double &dB12,
    double &db13,
    double &dB13,
    double &db21,
    double &dB21,
    double &db22,
    double &dB22,
    double &db23,
    double &dB23,
    double &db31,
    double &dB31,
    double &db32,
    double &dB32,
    double &db33,
    double &dB33,
    double &dchi1,
    double &dchi2,
    double &dchi3,
    double &dda11,
    double &dda12,
    double &dda13,
    double &dda22,
    double &dda23,
    double &dda33,
    double &ddb111,
    double &ddb112,
    double &ddb113,
    double &ddb121,
    double &ddb122,
    double &ddb123,
    double &ddb131,
    double &ddb132,
    double &ddb133,
    double &ddb221,
    double &ddb222,
    double &ddb223,
    double &ddb231,
    double &ddb232,
    double &ddb233,
    double &ddb331,
    double &ddb332,
    double &ddb333,
    double &ddchi11,
    double &ddchi12,
    double &ddchi13,
    double &ddchi22,
    double &ddchi23,
    double &ddchi33,
    double &deldelg1111,
    double &deldelg1112,
    double &deldelg1113,
    double &deldelg1122,
    double &deldelg1123,
    double &deldelg1133,
    double &deldelg1211,
    double &deldelg1212,
    double &deldelg1213,
    double &deldelg1222,
    double &deldelg1223,
    double &deldelg1233,
    double &deldelg1311,
    double &deldelg1312,
    double &deldelg1313,
    double &deldelg1322,
    double &deldelg1323,
    double &deldelg1333,
    double &deldelg2211,
    double &deldelg2212,
    double &deldelg2213,
    double &deldelg2222,
    double &deldelg2223,
    double &deldelg2233,
    double &deldelg2311,
    double &deldelg2312,
    double &deldelg2313,
    double &deldelg2322,
    double &deldelg2323,
    double &deldelg2333,
    double &deldelg3311,
    double &deldelg3312,
    double &deldelg3313,
    double &deldelg3322,
    double &deldelg3323,
    double &deldelg3333,
    double &delG11,
    double &delg111,
    double &delg112,
    double &delg113,
    double &delG12,
    double &delg122,
    double &delg123,
    double &delG13,
    double &delg133,
    double &delG21,
    double &delg211,
    double &delg212,
    double &delg213,
    double &delG22,
    double &delg222,
    double &delg223,
    double &delG23,
    double &delg233,
    double &delG31,
    double &delg311,
    double &delg312,
    double &delg313,
    double &delG32,
    double &delg322,
    double &delg323,
    double &delG33,
    double &delg333,
    double &dKhat1,
    double &dKhat2,
    double &dKhat3,
    double &dTheta1,
    double &dTheta2,
    double &dTheta3,
    double &G1,
    double &g11,
    double &g12,
    double &g13,
    double &G2,
    double &g22,
    double &g23,
    double &G3,
    double &g33,
    double &kappa1,
    double &kappa2,
    double &Khat,
    double &rA11,
    double &rA12,
    double &rA13,
    double &rA22,
    double &rA23,
    double &rA33,
    double &rchi,
    double &rG1,
    double &rg11,
    double &rg12,
    double &rg13,
    double &rG2,
    double &rg22,
    double &rg23,
    double &rG3,
    double &rg33,
    double &rKhat,
    double &rTheta,
    double &Theta);
 // Computes the contracted Christoffel symbols Gam^i = g^{jk} Gam^i_{jk} of a
 // symmetric 3-metric at a single point.
 //   gxx..gzz                 : the six independent metric components
 //   g<ab><c> (e.g. gxyx)     : derivative of component (a,b) along direction c
 //   Gamxa, Gamya, Gamza      : outputs, the x/y/z components of the contraction
 // No guard is applied against a vanishing determinant.
 static inline void z4c_contract_gamma(
    const double gxx, const double gxy, const double gxz,
    const double gyy, const double gyz, const double gzz,
    const double gxxx, const double gxyx, const double gxzx,
    const double gyyx, const double gyzx, const double gzzx,
    const double gxxy, const double gxyy, const double gxzy,
    const double gyyy, const double gyzy, const double gzzy,
    const double gxxz, const double gxyz, const double gxzz,
    const double gyyz, const double gyzz, const double gzzz,
    double &Gamxa, double &Gamya, double &Gamza)
 {
    // Determinant of the 3x3 symmetric metric.
    const double metric_det = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz -
                              gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz;
    // Inverse metric from the cofactors.
    const double ixx = (gyy * gzz - gyz * gyz) / metric_det;
    const double ixy = -(gxy * gzz - gyz * gxz) / metric_det;
    const double ixz = (gxy * gyz - gyy * gxz) / metric_det;
    const double iyy = (gxx * gzz - gxz * gxz) / metric_det;
    const double iyz = -(gxx * gyz - gxy * gxz) / metric_det;
    const double izz = (gxx * gyy - gxy * gxy) / metric_det;
    // Derivative combinations shared by the three upper-index values of each
    // lower-index pair (twice the first-kind Christoffel symbol).
    const double lxx_y = 2.0 * gxyx - gxxy;
    const double lxx_z = 2.0 * gxzx - gxxz;
    const double lyy_x = 2.0 * gxyy - gyyx;
    const double lyy_z = 2.0 * gyzy - gyyz;
    const double lzz_x = 2.0 * gxzz - gzzx;
    const double lzz_y = 2.0 * gyzz - gzzy;
    const double lxy_z = gxzy + gyzx - gxyz;
    const double lxz_y = gxyz + gyzx - gxzy;
    const double lyz_x = gxyz + gxzy - gyzx;
    // Second-kind Christoffel symbols, named c<upper>_<lower pair>.
    const double cx_xx = 0.5 * (ixx * gxxx + ixy * lxx_y + ixz * lxx_z);
    const double cy_xx = 0.5 * (ixy * gxxx + iyy * lxx_y + iyz * lxx_z);
    const double cz_xx = 0.5 * (ixz * gxxx + iyz * lxx_y + izz * lxx_z);
    const double cx_yy = 0.5 * (ixx * lyy_x + ixy * gyyy + ixz * lyy_z);
    const double cy_yy = 0.5 * (ixy * lyy_x + iyy * gyyy + iyz * lyy_z);
    const double cz_yy = 0.5 * (ixz * lyy_x + iyz * gyyy + izz * lyy_z);
    const double cx_zz = 0.5 * (ixx * lzz_x + ixy * lzz_y + ixz * gzzz);
    const double cy_zz = 0.5 * (ixy * lzz_x + iyy * lzz_y + iyz * gzzz);
    const double cz_zz = 0.5 * (ixz * lzz_x + iyz * lzz_y + izz * gzzz);
    const double cx_xy = 0.5 * (ixx * gxxy + ixy * gyyx + ixz * lxy_z);
    const double cy_xy = 0.5 * (ixy * gxxy + iyy * gyyx + iyz * lxy_z);
    const double cz_xy = 0.5 * (ixz * gxxy + iyz * gyyx + izz * lxy_z);
    const double cx_xz = 0.5 * (ixx * gxxz + ixy * lxz_y + ixz * gzzx);
    const double cy_xz = 0.5 * (ixy * gxxz + iyy * lxz_y + iyz * gzzx);
    const double cz_xz = 0.5 * (ixz * gxxz + iyz * lxz_y + izz * gzzx);
    const double cx_yz = 0.5 * (ixx * lyz_x + ixy * gyyz + ixz * gzzy);
    const double cy_yz = 0.5 * (ixy * lyz_x + iyy * gyyz + iyz * gzzy);
    const double cz_yz = 0.5 * (ixz * lyz_x + iyz * gyyz + izz * gzzy);
    // Trace over the lower index pair; off-diagonal pairs count twice.
    Gamxa = ixx * cx_xx + iyy * cx_yy + izz * cx_zz +
            2.0 * (ixy * cx_xy + ixz * cx_xz + iyz * cx_yz);
    Gamya = ixx * cy_xx + iyy * cy_yy + izz * cy_zz +
            2.0 * (ixy * cy_xy + ixz * cy_xz + iyz * cy_yz);
    Gamza = ixx * cz_xx + iyy * cz_yy + izz * cz_zz +
            2.0 * (ixy * cz_xy + ixz * cz_xz + iyz * cz_yz);
 }
 namespace
 {
 // Hands out equally sized, non-overlapping slices of one heap buffer.
 // Used for the per-call work arrays of compute_rhs_z4c_cartesian so that they
 // do not live on the stack.
 struct z4c_scratch_pool
 {
    std::vector<double> storage; // n_fields * n_points doubles, zero-initialised
    std::size_t stride;          // number of doubles per slice
    std::size_t handed_out;      // slices given out so far
    z4c_scratch_pool(std::size_t n_fields, std::size_t n_points)
        : storage(n_fields * n_points), stride(n_points), handed_out(0)
    {
    }
    // Returns the next unused slice. Throws std::out_of_range (via at()) if more
    // slices are requested than the pool was sized for.
    double *next()
    {
        if (stride == 0)
            return 0;
        return &storage.at(stride * handed_out++);
    }
 };
 }
 // Evaluates the Z4c right-hand sides on one Cartesian grid patch.
 //
 // Steps, in order: build lapse/chi/diagonal-metric values with the +1 offset
 // restored, take first and second derivatives with fderivs/fdderivs, call the
 // pointwise kernel f_z4c_rhs_point at every grid point, set the gauge right-hand
 // sides (selected at compile time by GAUGE), add lopsided advection terms, add
 // the kappa1/kappa2 constraint-damping terms, optionally add Kreiss-Oliger
 // dissipation through kodis, and optionally evaluate the constraints.
 //
 //   ex               : grid extents; ex[0]*ex[1]*ex[2] points are processed
 //   T                : unused
 //   X, Y, Z          : coordinate arrays forwarded to the derivative routines
 //   chi_state        : chi used for the evolution equations
 //   chi_constraints  : chi handed to f_ricci_gamma / f_constraint_bssn
 //   dxx, dyy, dzz    : diagonal metric components stored with 1 subtracted
 //   Lap              : lapse stored with 1 subtracted
 //   dtSfx..dtSfz     : passed to the kernel as its B1..B3 arguments
 //   TZ               : passed to the kernel as its Theta argument
 //   *_rhs            : outputs, one right-hand side per evolved field
 //   Sxx..Szz         : unused here
 //   Gamxxx..Gamzzz, Rxx..Rzz, Hcon..Gmzcon : only touched when co == 0
 //   Symmetry, Lev    : forwarded to the derivative/advection/dissipation calls
 //   eps              : dissipation is applied only when eps > 0
 //   co               : constraints are evaluated only when co == 0
 // Returns 0 in all cases.
 static int compute_rhs_z4c_cartesian(
    int *ex, double &T, double *X, double *Y, double *Z,
    double *chi_state, double *chi_constraints, double *trK,
    double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
    double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
    double *Gamx, double *Gamy, double *Gamz,
    double *Lap, double *betax, double *betay, double *betaz,
    double *dtSfx, double *dtSfy, double *dtSfz,
    double *TZ,
    double *chi_rhs, double *trK_rhs,
    double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
    double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
    double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
    double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
    double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
    double *TZ_rhs,
    double *rho, double *Sx, double *Sy, double *Sz,
    double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
    double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
    double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
    double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
    double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
    double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
    int &Symmetry, int &Lev, double &eps, int &co)
 {
    (void)T;
    const int nx = ex[0];
    const int ny = ex[1];
    const int nz = ex[2];
    const int all = nx * ny * nz;
    // The 146 work arrays below used to be variable-length arrays on the stack.
    // That is a compiler extension in C++ and needs 146 * 8 * all bytes of stack,
    // which overflows a default-sized stack for realistic patches. They are now
    // slices of a single heap allocation that is released on return.
    const std::size_t n_scratch_fields = 146;
    z4c_scratch_pool ws(n_scratch_fields, static_cast<std::size_t>(all > 0 ? all : 0));
    double *alpn1 = ws.next(), *chin1 = ws.next(), *gxx = ws.next(), *gyy = ws.next(), *gzz = ws.next();
    double *chix = ws.next(), *chiy = ws.next(), *chiz = ws.next();
    double *chixx = ws.next(), *chixy = ws.next(), *chixz = ws.next(), *chiyy = ws.next(), *chiyz = ws.next(), *chizz = ws.next();
    double *gxxx = ws.next(), *gxyx = ws.next(), *gxzx = ws.next(), *gyyx = ws.next(), *gyzx = ws.next(), *gzzx = ws.next();
    double *gxxy = ws.next(), *gxyy = ws.next(), *gxzy = ws.next(), *gyyy = ws.next(), *gyzy = ws.next(), *gzzy = ws.next();
    double *gxxz = ws.next(), *gxyz = ws.next(), *gxzz = ws.next(), *gyyz = ws.next(), *gyzz = ws.next(), *gzzz = ws.next();
    double *gxxxx = ws.next(), *gxxxy = ws.next(), *gxxxz = ws.next(), *gxxyy = ws.next(), *gxxyz = ws.next(), *gxxzz = ws.next();
    double *gxyxx = ws.next(), *gxyxy = ws.next(), *gxyxz = ws.next(), *gxyyy = ws.next(), *gxyyz = ws.next(), *gxyzz = ws.next();
    double *gxzxx = ws.next(), *gxzxy = ws.next(), *gxzxz = ws.next(), *gxzyy = ws.next(), *gxzyz = ws.next(), *gxzzz = ws.next();
    double *gyyxx = ws.next(), *gyyxy = ws.next(), *gyyxz = ws.next(), *gyyyy = ws.next(), *gyyyz = ws.next(), *gyyzz = ws.next();
    double *gyzxx = ws.next(), *gyzxy = ws.next(), *gyzxz = ws.next(), *gyzyy = ws.next(), *gyzyz = ws.next(), *gyzzz = ws.next();
    double *gzzxx = ws.next(), *gzzxy = ws.next(), *gzzxz = ws.next(), *gzzyy = ws.next(), *gzzyz = ws.next(), *gzzzz = ws.next();
    double *Lapx = ws.next(), *Lapy = ws.next(), *Lapz = ws.next();
    double *Lapxx = ws.next(), *Lapxy = ws.next(), *Lapxz = ws.next(), *Lapyy = ws.next(), *Lapyz = ws.next(), *Lapzz = ws.next();
    double *betaxx = ws.next(), *betaxy = ws.next(), *betaxz = ws.next();
    double *betayx = ws.next(), *betayy = ws.next(), *betayz = ws.next();
    double *betazx = ws.next(), *betazy = ws.next(), *betazz = ws.next();
    double *dBxx = ws.next(), *dBxy = ws.next(), *dBxz = ws.next();
    double *dByx = ws.next(), *dByy = ws.next(), *dByz = ws.next();
    double *dBzx = ws.next(), *dBzy = ws.next(), *dBzz = ws.next();
    double *sfxxx = ws.next(), *sfxxy = ws.next(), *sfxxz = ws.next(), *sfxyy = ws.next(), *sfxyz = ws.next(), *sfxzz = ws.next();
    double *sfyxx = ws.next(), *sfyxy = ws.next(), *sfyxz = ws.next(), *sfyyy = ws.next(), *sfyyz = ws.next(), *sfyzz = ws.next();
    double *sfzxx = ws.next(), *sfzxy = ws.next(), *sfzxz = ws.next(), *sfzyy = ws.next(), *sfzyz = ws.next(), *sfzzz = ws.next();
    double *Gamxx = ws.next(), *Gamxy = ws.next(), *Gamxz = ws.next();
    double *Gamyx = ws.next(), *Gamyy = ws.next(), *Gamyz = ws.next();
    double *Gamzx = ws.next(), *Gamzy = ws.next(), *Gamzz = ws.next();
    double *Kx = ws.next(), *Ky = ws.next(), *Kz = ws.next(), *TZx = ws.next(), *TZy = ws.next(), *TZz = ws.next();
    double *Axxx = ws.next(), *Axxy = ws.next(), *Axxz = ws.next(), *Axyx = ws.next(), *Axyy = ws.next(), *Axyz = ws.next();
    double *Axzx = ws.next(), *Axzy = ws.next(), *Axzz = ws.next(), *Ayyx = ws.next(), *Ayyy = ws.next(), *Ayyz = ws.next();
    double *Ayzx = ws.next(), *Ayzy = ws.next(), *Ayzz = ws.next(), *Azzx = ws.next(), *Azzy = ws.next(), *Azzz = ws.next();
    // Reflection parities (x, y, z) handed to lopsided/kodis for each field type.
    const double SSS[3] = {1.0, 1.0, 1.0};
    const double AAS[3] = {-1.0, -1.0, 1.0};
    const double ASA[3] = {-1.0, 1.0, -1.0};
    const double SAA[3] = {1.0, -1.0, -1.0};
    const double ASS[3] = {-1.0, 1.0, 1.0};
    const double SAS[3] = {1.0, -1.0, 1.0};
    const double SSA[3] = {1.0, 1.0, -1.0};
    const double ONE = 1.0;
    const double TWO = 2.0;
    const double ZEO = 0.0;
    // Not const: the kernel takes these by non-const reference.
    double chiDivfloor = 1.0e-5;
    double kappa1 = 2.0e-2;
    double kappa2 = 0.0;
    double FF = 0.75;
    double eta = 2.0;
    // Restore the +1 offset of lapse, chi and the diagonal metric components.
    for (int idx = 0; idx < all; ++idx)
    {
        alpn1[idx] = Lap[idx] + ONE;
        chin1[idx] = chi_state[idx] + ONE;
        gxx[idx] = dxx[idx] + ONE;
        gyy[idx] = dyy[idx] + ONE;
        gzz[idx] = dzz[idx] + ONE;
    }
    fderivs(ex, betax, betaxx, betaxy, betaxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, betay, betayx, betayy, betayz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, betaz, betazx, betazy, betazz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, dtSfx, dBxx, dBxy, dBxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, dtSfy, dByx, dByy, dByz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, dtSfz, dBzx, dBzy, dBzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, chi_state, chix, chiy, chiz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, dxx, gxxx, gxxy, gxxz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, gxy, gxyx, gxyy, gxyz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, gxz, gxzx, gxzy, gxzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, dyy, gyyx, gyyy, gyyz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, gyz, gyzx, gyzy, gyzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
    fderivs(ex, dzz, gzzx, gzzy, gzzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, dxx, gxxxx, gxxxy, gxxxz, gxxyy, gxxyz, gxxzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, dyy, gyyxx, gyyxy, gyyxz, gyyyy, gyyyz, gyyzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, dzz, gzzxx, gzzxy, gzzxz, gzzyy, gzzyz, gzzzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, gxy, gxyxx, gxyxy, gxyxz, gxyyy, gxyyz, gxyzz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, gxz, gxzxx, gxzxy, gxzxz, gxzyy, gxzyz, gxzzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
    fdderivs(ex, gyz, gyzxx, gyzxy, gyzxz, gyzyy, gyzyz, gyzzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
    fderivs(ex, Gamx, Gamxx, Gamxy, Gamxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Gamy, Gamyx, Gamyy, Gamyz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Gamz, Gamzx, Gamzy, Gamzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, trK, Kx, Ky, Kz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, TZ, TZx, TZy, TZz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, betax, sfxxx, sfxxy, sfxxz, sfxyy, sfxyz, sfxzz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, betay, sfyxx, sfyxy, sfyxz, sfyyy, sfyyz, sfyzz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, betaz, sfzxx, sfzxy, sfzxz, sfzyy, sfzyz, sfzzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
    fdderivs(ex, chi_state, chixx, chixy, chixz, chiyy, chiyz, chizz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, Lap, Lapxx, Lapxy, Lapxz, Lapyy, Lapyz, Lapzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Axx, Axxx, Axxy, Axxz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Axy, Axyx, Axyy, Axyz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Axz, Axzx, Axzy, Axzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, Ayy, Ayyx, Ayyy, Ayyz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Ayz, Ayzx, Ayzy, Ayzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
    fderivs(ex, Azz, Azzx, Azzy, Azzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    // Pointwise kernel. The argument order follows the f_z4c_rhs_point
    // prototype, not the logical grouping used elsewhere in this function.
    for (int idx = 0; idx < all; ++idx)
    {
        // The kernel is given kappa1 = 0; the kappa1 damping terms are added
        // explicitly in the loop after the advection step below.
        double point_kappa1 = 0.0;
        f_z4c_rhs_point(
            Axx[idx], Axy[idx], Axz[idx], Ayy[idx], Ayz[idx], Azz[idx],
            alpn1[idx], dtSfx[idx], dtSfy[idx], dtSfz[idx],
            betax[idx], betay[idx], betaz[idx],
            chin1[idx], chiDivfloor,
            Lapx[idx],
            Axxx[idx], Axyx[idx], Axzx[idx], Ayyx[idx], Ayzx[idx], Azzx[idx],
            Lapy[idx],
            Axxy[idx], Axyy[idx], Axzy[idx], Ayyy[idx], Ayzy[idx], Azzy[idx],
            Lapz[idx],
            Axxz[idx], Axyz[idx], Axzz[idx], Ayyz[idx], Ayzz[idx], Azzz[idx],
            betaxx[idx], dBxx[idx], betayx[idx], dByx[idx], betazx[idx], dBzx[idx],
            betaxy[idx], dBxy[idx], betayy[idx], dByy[idx], betazy[idx], dBzy[idx],
            betaxz[idx], dBxz[idx], betayz[idx], dByz[idx], betazz[idx], dBzz[idx],
            chix[idx], chiy[idx], chiz[idx],
            Lapxx[idx], Lapxy[idx], Lapxz[idx], Lapyy[idx], Lapyz[idx], Lapzz[idx],
            sfxxx[idx], sfyxx[idx], sfzxx[idx],
            sfxxy[idx], sfyxy[idx], sfzxy[idx],
            sfxxz[idx], sfyxz[idx], sfzxz[idx],
            sfxyy[idx], sfyyy[idx], sfzyy[idx],
            sfxyz[idx], sfyyz[idx], sfzyz[idx],
            sfxzz[idx], sfyzz[idx], sfzzz[idx],
            chixx[idx], chixy[idx], chixz[idx], chiyy[idx], chiyz[idx], chizz[idx],
            gxxxx[idx], gxyxx[idx], gxzxx[idx], gyyxx[idx], gyzxx[idx], gzzxx[idx],
            gxxxy[idx], gxyxy[idx], gxzxy[idx], gyyxy[idx], gyzxy[idx], gzzxy[idx],
            gxxxz[idx], gxyxz[idx], gxzxz[idx], gyyxz[idx], gyzxz[idx], gzzxz[idx],
            gxxyy[idx], gxyyy[idx], gxzyy[idx], gyyyy[idx], gyzyy[idx], gzzyy[idx],
            gxxyz[idx], gxyyz[idx], gxzyz[idx], gyyyz[idx], gyzyz[idx], gzzyz[idx],
            gxxzz[idx], gxyzz[idx], gxzzz[idx], gyyzz[idx], gyzzz[idx], gzzzz[idx],
            Gamxx[idx], gxxx[idx], gxyx[idx], gxzx[idx],
            Gamyx[idx], gyyx[idx], gyzx[idx],
            Gamzx[idx], gzzx[idx],
            Gamxy[idx], gxxy[idx], gxyy[idx], gxzy[idx],
            Gamyy[idx], gyyy[idx], gyzy[idx],
            Gamzy[idx], gzzy[idx],
            Gamxz[idx], gxxz[idx], gxyz[idx], gxzz[idx],
            Gamyz[idx], gyyz[idx], gyzz[idx],
            Gamzz[idx], gzzz[idx],
            Kx[idx], Ky[idx], Kz[idx],
            TZx[idx], TZy[idx], TZz[idx],
            Gamx[idx], gxx[idx], gxy[idx], gxz[idx],
            Gamy[idx], gyy[idx], gyz[idx],
            Gamz[idx], gzz[idx],
            point_kappa1, kappa2,
            trK[idx],
            Axx_rhs[idx], Axy_rhs[idx], Axz_rhs[idx], Ayy_rhs[idx], Ayz_rhs[idx], Azz_rhs[idx],
            chi_rhs[idx],
            Gamx_rhs[idx], gxx_rhs[idx], gxy_rhs[idx], gxz_rhs[idx],
            Gamy_rhs[idx], gyy_rhs[idx], gyz_rhs[idx],
            Gamz_rhs[idx], gzz_rhs[idx], trK_rhs[idx], TZ_rhs[idx], TZ[idx]);
    }
    // Lapse right-hand side before advection: -2 * alpha * K.
    for (int idx = 0; idx < all; ++idx)
        Lap_rhs[idx] = -TWO * alpn1[idx] * trK[idx];
 #if (GAUGE == 0)
    for (int idx = 0; idx < all; ++idx)
    {
        betax_rhs[idx] = FF * dtSfx[idx];
        betay_rhs[idx] = FF * dtSfy[idx];
        betaz_rhs[idx] = FF * dtSfz[idx];
        dtSfx_rhs[idx] = Gamx_rhs[idx] - eta * dtSfx[idx];
        dtSfy_rhs[idx] = Gamy_rhs[idx] - eta * dtSfy[idx];
        dtSfz_rhs[idx] = Gamz_rhs[idx] - eta * dtSfz[idx];
    }
 #elif (GAUGE == 1)
    for (int idx = 0; idx < all; ++idx)
    {
        betax_rhs[idx] = Gamx[idx] - eta * betax[idx];
        betay_rhs[idx] = Gamy[idx] - eta * betay[idx];
        betaz_rhs[idx] = Gamz[idx] - eta * betaz[idx];
        dtSfx_rhs[idx] = ZEO;
        dtSfy_rhs[idx] = ZEO;
        dtSfz_rhs[idx] = ZEO;
    }
 #else
 #error "z4c_rhs_c.C currently supports GAUGE == 0 or GAUGE == 1 for Z4C"
 #endif
    // Advection along the shift, added onto the right-hand sides computed above.
    lopsided(ex, X, Y, Z, gxx, gxx_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, gxy, gxy_rhs, betax, betay, betaz, Symmetry, AAS);
    lopsided(ex, X, Y, Z, gxz, gxz_rhs, betax, betay, betaz, Symmetry, ASA);
    lopsided(ex, X, Y, Z, gyy, gyy_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, gyz, gyz_rhs, betax, betay, betaz, Symmetry, SAA);
    lopsided(ex, X, Y, Z, gzz, gzz_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Axx, Axx_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Axy, Axy_rhs, betax, betay, betaz, Symmetry, AAS);
    lopsided(ex, X, Y, Z, Axz, Axz_rhs, betax, betay, betaz, Symmetry, ASA);
    lopsided(ex, X, Y, Z, Ayy, Ayy_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Ayz, Ayz_rhs, betax, betay, betaz, Symmetry, SAA);
    lopsided(ex, X, Y, Z, Azz, Azz_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, chi_state, chi_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, trK, trK_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Gamx, Gamx_rhs, betax, betay, betaz, Symmetry, ASS);
    lopsided(ex, X, Y, Z, Gamy, Gamy_rhs, betax, betay, betaz, Symmetry, SAS);
    lopsided(ex, X, Y, Z, Gamz, Gamz_rhs, betax, betay, betaz, Symmetry, SSA);
    lopsided(ex, X, Y, Z, Lap, Lap_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, betax, betax_rhs, betax, betay, betaz, Symmetry, ASS);
    lopsided(ex, X, Y, Z, betay, betay_rhs, betax, betay, betaz, Symmetry, SAS);
    lopsided(ex, X, Y, Z, betaz, betaz_rhs, betax, betay, betaz, Symmetry, SSA);
 #if (GAUGE == 0)
    lopsided(ex, X, Y, Z, dtSfx, dtSfx_rhs, betax, betay, betaz, Symmetry, ASS);
    lopsided(ex, X, Y, Z, dtSfy, dtSfy_rhs, betax, betay, betaz, Symmetry, SAS);
    lopsided(ex, X, Y, Z, dtSfz, dtSfz_rhs, betax, betay, betaz, Symmetry, SSA);
 #endif
    lopsided(ex, X, Y, Z, TZ, TZ_rhs, betax, betay, betaz, Symmetry, SSS);
    // Constraint damping, using the Christoffel contraction computed from the
    // metric in place of the evolved Gam* where the two are compared.
    for (int idx = 0; idx < all; ++idx)
    {
        double Gamxa = 0.0, Gamya = 0.0, Gamza = 0.0;
        z4c_contract_gamma(
            gxx[idx], gxy[idx], gxz[idx], gyy[idx], gyz[idx], gzz[idx],
            gxxx[idx], gxyx[idx], gxzx[idx], gyyx[idx], gyzx[idx], gzzx[idx],
            gxxy[idx], gxyy[idx], gxzy[idx], gyyy[idx], gyzy[idx], gzzy[idx],
            gxxz[idx], gxyz[idx], gxzz[idx], gyyz[idx], gyzz[idx], gzzz[idx],
            Gamxa, Gamya, Gamza);
        TZ_rhs[idx] -= alpn1[idx] * (TWO + kappa2) * kappa1 * TZ[idx];
        trK_rhs[idx] += alpn1[idx] * kappa1 * (ONE - kappa2) * TZ[idx];
        Gamx_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamx[idx] - Gamxa);
        Gamy_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamy[idx] - Gamya);
        Gamz_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamz[idx] - Gamza);
    }
    // Dissipation is skipped entirely for eps <= 0.
    if (eps > 0.0)
    {
        kodis(ex, X, Y, Z, chi_state, chi_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, trK, trK_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, gxx, gxx_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, gxy, gxy_rhs, AAS, Symmetry, eps);
        kodis(ex, X, Y, Z, gxz, gxz_rhs, ASA, Symmetry, eps);
        kodis(ex, X, Y, Z, gyy, gyy_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, gyz, gyz_rhs, SAA, Symmetry, eps);
        kodis(ex, X, Y, Z, gzz, gzz_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Axx, Axx_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Axy, Axy_rhs, AAS, Symmetry, eps);
        kodis(ex, X, Y, Z, Axz, Axz_rhs, ASA, Symmetry, eps);
        kodis(ex, X, Y, Z, Ayy, Ayy_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Ayz, Ayz_rhs, SAA, Symmetry, eps);
        kodis(ex, X, Y, Z, Azz, Azz_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Gamx, Gamx_rhs, ASS, Symmetry, eps);
        kodis(ex, X, Y, Z, Gamy, Gamy_rhs, SAS, Symmetry, eps);
        kodis(ex, X, Y, Z, Gamz, Gamz_rhs, SSA, Symmetry, eps);
        kodis(ex, X, Y, Z, Lap, Lap_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, betax, betax_rhs, ASS, Symmetry, eps);
        kodis(ex, X, Y, Z, betay, betay_rhs, SAS, Symmetry, eps);
        kodis(ex, X, Y, Z, betaz, betaz_rhs, SSA, Symmetry, eps);
 #if (GAUGE == 0)
        kodis(ex, X, Y, Z, dtSfx, dtSfx_rhs, ASS, Symmetry, eps);
        kodis(ex, X, Y, Z, dtSfy, dtSfy_rhs, SAS, Symmetry, eps);
        kodis(ex, X, Y, Z, dtSfz, dtSfz_rhs, SSA, Symmetry, eps);
 #endif
        kodis(ex, X, Y, Z, TZ, TZ_rhs, SSS, Symmetry, eps);
    }
    // Constraint evaluation is requested with co == 0.
    if (co == 0)
    {
 #if (ABV == 0)
        f_ricci_gamma(ex, X, Y, Z,
                      chi_constraints,
                      dxx, gxy, gxz, dyy, gyz, dzz,
                      Gamx, Gamy, Gamz,
                      Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                      Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                      Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                      Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                      Symmetry);
 #endif
        f_constraint_bssn(ex, X, Y, Z,
                          chi_constraints, trK,
                          dxx, gxy, gxz, dyy, gyz, dzz,
                          Axx, Axy, Axz, Ayy, Ayz, Azz,
                          Gamx, Gamy, Gamz,
                          Lap, betax, betay, betaz, rho, Sx, Sy, Sz,
                          Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                          Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                          Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                          Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                          Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
                          Symmetry);
    }
    return 0;
 }
 // C-linkage entry point for the Z4c right-hand side on a Cartesian patch.
 // Forwards every argument unchanged to compute_rhs_z4c_cartesian; the single
 // chi array supplied here is used both as the evolved chi and as the chi that
 // the constraint evaluation reads. Returns whatever the worker returns.
 extern "C" int f_compute_rhs_Z4c(int *ex, double &T,
                                 double *X, double *Y, double *Z,
                                 double *chi, double *trK,
                                 double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                                 double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                                 double *Gamx, double *Gamy, double *Gamz,
                                 double *Lap, double *betax, double *betay, double *betaz,
                                 double *dtSfx, double *dtSfy, double *dtSfz,
                                 double *TZ,
                                 double *chi_rhs, double *trK_rhs,
                                 double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                                 double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                                 double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                                 double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                                 double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                                 double *TZ_rhs,
                                 double *rho, double *Sx, double *Sy, double *Sz,
                                 double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                                 double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                                 double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                                 double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                                 double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                                 double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
                                 int &Symmetry, int &Lev, double &eps, int &co)
 {
    // One chi array plays both roles in this variant.
    double *const evolved_chi = chi;
    double *const constraint_chi = chi;
    const int status = compute_rhs_z4c_cartesian(
        ex, T, X, Y, Z,
        evolved_chi, constraint_chi, trK,
        dxx, gxy, gxz, dyy, gyz, dzz,
        Axx, Axy, Axz, Ayy, Ayz, Azz,
        Gamx, Gamy, Gamz,
        Lap, betax, betay, betaz,
        dtSfx, dtSfy, dtSfz,
        TZ,
        chi_rhs, trK_rhs,
        gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
        Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
        Gamx_rhs, Gamy_rhs, Gamz_rhs,
        Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
        dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
        TZ_rhs,
        rho, Sx, Sy, Sz,
        Sxx, Sxy, Sxz, Syy, Syz, Szz,
        Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
        Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
        Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
        Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
        Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
        Symmetry, Lev, eps, co);
    return status;
 }
 // C-linkage entry point: evaluate the Z4c right-hand sides using a lower-bounded
 // copy of chi, then -- only when the RHS call returned 0 and co == 0 -- run the
 // constraint evaluation.
 //
 // The caller's chi array is never written here: it is copied into chi_clamped and
 // f_lowerboundset(ex, copy, chitiny) is applied to the copy only.
 // NOTE(review): f_lowerboundset presumably floors every entry at chitiny -- confirm.
 //
 // Returns the status of compute_rhs_z4c_cartesian unchanged.
 extern "C" int f_compute_rhs_Z4cnot(int *ex, double &T,
                                    double *X, double *Y, double *Z,
                                    double *chi, double *trK,
                                    double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                                    double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                                    double *Gamx, double *Gamy, double *Gamz,
                                    double *Lap, double *betax, double *betay, double *betaz,
                                    double *dtSfx, double *dtSfy, double *dtSfz,
                                    double *TZ,
                                    double *chi_rhs, double *trK_rhs,
                                    double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                                    double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                                    double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                                    double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                                    double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                                    double *TZ_rhs,
                                    double *rho, double *Sx, double *Sy, double *Sz,
                                    double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                                    double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                                    double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                                    double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                                    double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                                    double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
                                    int &Symmetry, int &Lev, double &eps, int &co, double &chitiny)
 {
    // Element count of one field: the product of the three extents stored in ex.
    const int all = ex[0] * ex[1] * ex[2];
    // Work on a private copy so the lower bound never touches the caller's chi.
    std::vector<double> chi_clamped(chi, chi + all);
    f_lowerboundset(ex, chi_clamped.data(), chitiny);
    // The first chi argument is the lower-bounded copy, the second is the raw chi.
    const int ret = compute_rhs_z4c_cartesian(
        ex, T, X, Y, Z,
        chi_clamped.data(), chi, trK,
        dxx, gxy, gxz, dyy, gyz, dzz,
        Axx, Axy, Axz, Ayy, Ayz, Azz,
        Gamx, Gamy, Gamz,
        Lap, betax, betay, betaz,
        dtSfx, dtSfy, dtSfz,
        TZ,
        chi_rhs, trK_rhs,
        gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
        Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
        Gamx_rhs, Gamy_rhs, Gamz_rhs,
        Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
        dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
        TZ_rhs,
        rho, Sx, Sy, Sz,
        Sxx, Sxy, Sxz, Syy, Syz, Szz,
        Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
        Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
        Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
        Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
        Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
        Symmetry, Lev, eps, co);
    // Skip the constraint evaluation when the RHS call failed or co is nonzero.
    if (ret != 0 || co != 0)
        return ret;
    // Only when ABV == 0: f_ricci_gamma is called with the raw (unclamped) chi
    // before the constraints. NOTE(review): presumably it refreshes the Gam*/R* arrays -- confirm.
 #if (ABV == 0)
    f_ricci_gamma(ex, X, Y, Z,
                  chi,
                  dxx, gxy, gxz, dyy, gyz, dzz,
                  Gamx, Gamy, Gamz,
                  Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                  Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                  Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                  Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                  Symmetry);
 #endif
    // The constraint routine also receives the raw chi; its return value (if any) is not used.
    // NOTE(review): Hcon, M*con and Gm*con are presumably its outputs -- confirm.
    f_constraint_bssn(ex, X, Y, Z,
                      chi, trK,
                      dxx, gxy, gxz, dyy, gyz, dzz,
                      Axx, Axy, Axz, Ayy, Ayz, Azz,
                      Gamx, Gamy, Gamz,
                      Lap, betax, betay, betaz, rho, Sx, Sy, Sz,
                      Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                      Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                      Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                      Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                      Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
                      Symmetry);
    return ret;
 }
--- a/AMSS_NCKU_source/z4c_rhs_cuda.cu
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu
--- a/AMSS_NCKU_source/z4c_rhs_cuda.h
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.h
@@ -0,0 +1,258 @@
 #ifndef Z4C_RHS_CUDA_H
 #define Z4C_RHS_CUDA_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* State-field count constant for the Z4c CUDA interface.
  * NOTE(review): 25 equals the number of evolved-field pointers (chi ... TZ) in the
  * f_compute_rhs_Z4c* signatures -- presumably the same set; confirm. */
 enum {
    Z4C_CUDA_STATE_COUNT = 25
 };
 /* RK4 sub-step entry point for the block identified by block_tag.
  * NOTE(review): only the prototype is visible here; the behavior is inferred from the
  * name and parameter list -- confirm against z4c_rhs_cuda.cu.
  * The reference parameters (double &, int &) make this prototype C++-only even though
  * it is declared inside extern "C": a C translation unit cannot include this header. */
 int z4c_cuda_rk4_substep(void *block_tag,
                         int *ex, double *X, double *Y, double *Z,
                         double **state_host_in,
                         double **state_host_out,
                         const double *propspeed,
                         const double *soa_flat,
                         const double *bbox,
                         double &dT,
                         double &T,
                         int &RK4,
                         int &apply_bam_bc,
                         int &Symmetry,
                         int &Lev,
                         double &eps,
                         int &co,
                         int &keep_resident_state,
                         int &apply_enforce_ga,
                         double &chitiny);
 /* NOTE(review): presumably copies the device-resident state of block_tag into the
  * host arrays listed in state_host_out -- confirm. */
 int z4c_cuda_download_resident_state(void *block_tag,
                                     int *ex,
                                     double **state_host_out);
 /* Host-buffer pack/unpack prototypes: one state (state_index) or a batch of states
  * (state_count) over a sub-region of a block.
  * NOTE(review): presumably (i0, j0, k0) is the region origin and (sx, sy, sz) its
  * extent, both in grid indices of a block with extents ex -- confirm.
  * NOTE(review): the *_for_host_views variants take an extra state_host_key array,
  * presumably to select which host-side views the resident state corresponds to -- confirm. */
 int z4c_cuda_pack_state_region_to_host_buffer(void *block_tag,
                                              int state_index,
                                              double *host_buffer,
                                              int *ex,
                                              int i0, int j0, int k0,
                                              int sx, int sy, int sz);
 int z4c_cuda_unpack_state_region_from_host_buffer(void *block_tag,
                                                  int state_index,
                                                  double *host_buffer,
                                                  int *ex,
                                                  int i0, int j0, int k0,
                                                  int sx, int sy, int sz);
 int z4c_cuda_pack_state_batch_to_host_buffer(void *block_tag,
                                             int state_count,
                                             double *host_buffer,
                                             int *ex,
                                             int i0, int j0, int k0,
                                             int sx, int sy, int sz);
 int z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
                                                            double **state_host_key,
                                                            int state_count,
                                                            double *host_buffer,
                                                            int *ex,
                                                            int i0, int j0, int k0,
                                                            int sx, int sy, int sz);
 int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                 int state_count,
                                                 double *host_buffer,
                                                 int *ex,
                                                 int i0, int j0, int k0,
                                                 int sx, int sy, int sz);
 int z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
                                                                double **state_host_key,
                                                                int state_count,
                                                                double *host_buffer,
                                                                int *ex,
                                                                int i0, int j0, int k0,
                                                                int sx, int sy, int sz);
 /* Device-buffer counterparts of the batch pack/unpack prototypes: same parameter
  * lists, but the buffer argument is named device_buffer.
  * NOTE(review): device_buffer is presumably a GPU pointer (e.g. for CUDA-aware MPI);
  * the prototypes alone do not establish this -- confirm. */
 int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                               int state_count,
                                               double *device_buffer,
                                               int *ex,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz);
 int z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                              double **state_host_key,
                                                              int state_count,
                                                              double *device_buffer,
                                                              int *ex,
                                                              int i0, int j0, int k0,
                                                              int sx, int sy, int sz);
 int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
                                                   int *ex,
                                                   int i0, int j0, int k0,
                                                   int sx, int sy, int sz);
 int z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *device_buffer,
                                                                  int *ex,
                                                                  int i0, int j0, int k0,
                                                                  int sx, int sy, int sz);
 /* Segment-list pack/unpack prototypes: instead of a single (origin, extent) region
  * they take segment_count entries described by the integer array segment_meta.
  * NOTE(review): the layout of segment_meta (ints per segment and their meaning) is
  * not visible in this header -- see the implementation before relying on it. */
 int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag,
                                                  int state_count,
                                                  double *device_buffer,
                                                  int *ex,
                                                  int segment_count,
                                                  const int *segment_meta);
 int z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                 double **state_host_key,
                                                                 int state_count,
                                                                 double *device_buffer,
                                                                 int *ex,
                                                                 int segment_count,
                                                                 const int *segment_meta);
 int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
                                                      int state_count,
                                                      double *device_buffer,
                                                      int *ex,
                                                      int segment_count,
                                                      const int *segment_meta);
 int z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
                                                                     double **state_host_key,
                                                                     int state_count,
                                                                     double *device_buffer,
                                                                     int *ex,
                                                                     int segment_count,
                                                                     const int *segment_meta);
 /* Segment-list restrict/prolong prototypes writing into device_buffer.
  * NOTE(review): "restrict"/"prolong" presumably denote the AMR fine-to-coarse and
  * coarse-to-fine transfers -- confirm against the implementation.
  * NOTE(review): state_soa is presumably a flat per-state symmetry/parity table (cf.
  * soa_flat in z4c_cuda_rk4_substep); its length and layout are not visible here. */
 int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
                                                      int state_count,
                                                      double *device_buffer,
                                                      int *ex,
                                                      int segment_count,
                                                      const int *segment_meta,
                                                      const double *state_soa);
 int z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                     double **state_host_key,
                                                                     int state_count,
                                                                     double *device_buffer,
                                                                     int *ex,
                                                                     int segment_count,
                                                                     const int *segment_meta,
                                                                     const double *state_soa);
 int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
                                                     int state_count,
                                                     double *device_buffer,
                                                     int *ex,
                                                     int segment_count,
                                                     const int *segment_meta,
                                                     const double *state_soa);
 int z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                    double **state_host_key,
                                                                    int state_count,
                                                                    double *device_buffer,
                                                                    int *ex,
                                                                    int segment_count,
                                                                    const int *segment_meta,
                                                                    const double *state_soa);
 /* Single-region restrict/prolong prototypes writing into device_buffer.
  * NOTE(review): (sx, sy, sz) is presumably the extent of the output region;
  * (fi0, fj0, fk0) and (ii0, jj0, kk0) are presumably source-grid start indices and
  * (lbc_i, lbc_j, lbc_k) lower-boundary flags/offsets -- none of this is established
  * by the prototypes; confirm against z4c_rhs_cuda.cu. */
 int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
                                                   int *ex,
                                                   int sx, int sy, int sz,
                                                   int fi0, int fj0, int fk0,
                                                   const double *state_soa);
 int z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *device_buffer,
                                                                  int *ex,
                                                                  int sx, int sy, int sz,
                                                                  int fi0, int fj0, int fk0,
                                                                  const double *state_soa);
 int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                  int state_count,
                                                  double *device_buffer,
                                                  int *ex,
                                                  int sx, int sy, int sz,
                                                  int ii0, int jj0, int kk0,
                                                  int lbc_i, int lbc_j, int lbc_k,
                                                  const double *state_soa);
 int z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                 double **state_host_key,
                                                                 int state_count,
                                                                 double *device_buffer,
                                                                 int *ex,
                                                                 int sx, int sy, int sz,
                                                                 int ii0, int jj0, int kk0,
                                                                 int lbc_i, int lbc_j, int lbc_k,
                                                                 const double *state_soa);
 /* Transfer prototypes for a subset of the state fields, selected by the
  * subset_count entries of state_indices.
  * NOTE(review): presumably state_host_out / state_host_in hold one host pointer per
  * selected index, in the same order -- confirm. */
 int z4c_cuda_download_state_subset(void *block_tag,
                                   int *ex,
                                   int subset_count,
                                   const int *state_indices,
                                   double **state_host_out);
 int z4c_cuda_upload_state_subset(void *block_tag,
                                 int *ex,
                                 int subset_count,
                                 const int *state_indices,
                                 double **state_host_in);
 /* Constraint evaluation on the resident state of block_tag. Unlike
  * z4c_cuda_rk4_substep, Symmetry, eps and co are passed by value here.
  * NOTE(review): the number and order of arrays in constraint_host_out are not
  * visible in this header -- confirm against the implementation. */
 int z4c_cuda_compute_constraints_resident(void *block_tag,
                                          int *ex, double *X, double *Y, double *Z,
                                          int Symmetry, double eps, int co,
                                          double **constraint_host_out);
 /* Interpolation prototype for three state fields (state0..state2) at one point.
  * NOTE(review): presumably (x0, y0, z0) is the grid origin, (dx, dy, dz) the spacing,
  * (px, py, pz) the target point and ordn the interpolation order; soa3 and out3
  * presumably hold three entries each -- confirm. */
 int z4c_cuda_interp_state_point3(void *block_tag,
                                 int *ex,
                                 int state0,
                                 int state1,
                                 int state2,
                                 double x0,
                                 double y0,
                                 double z0,
                                 double dx,
                                 double dy,
                                 double dz,
                                 double px,
                                 double py,
                                 double pz,
                                 int ordn,
                                 int symmetry,
                                 const double *soa3,
                                 double *out3);
 /* Takes no block_tag, unlike the other download prototypes.
  * NOTE(review): presumably reads the outputs of the most recent constraint
  * evaluation -- confirm. */
 int z4c_cuda_download_constraint_outputs(int *ex,
                                         double **constraint_host_out);
 /* Residency queries and context release for a block.
  * NOTE(review): the int results are presumably boolean (nonzero = yes), and
  * release_step_ctx presumably frees the per-block device context -- confirm. */
 int z4c_cuda_has_resident_state(void *block_tag);
 int z4c_cuda_resident_state_matches(void *block_tag,
                                    double **state_host_key);
 void z4c_cuda_release_step_ctx(void *block_tag);
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/BSSN_BUILD_CONFIG_MIGRATION.md
+++ b/BSSN_BUILD_CONFIG_MIGRATION.md
@@ -1,211 +0,0 @@
 # BSSN Build Config Migration
 This note records the build-configuration fix needed when replacing
 `AMSS_NCKU_Input.py` or `generate_macrodef.py` with a newer upstream version.
 ## Problem
 `AMSS_NCKU_source/macrodef.h` is not the authoritative file used by normal
 runs. `AMSS_NCKU_Program.py` first generates macro files under
 `input_data.File_directory`, copies `AMSS_NCKU_source` to
 `<File_directory>/AMSS_NCKU_source_copy`, then copies the generated macro files
 into that copied source tree and compiles there.
 Therefore, makefile logic must not depend only on the stale
 `AMSS_NCKU_source/macrodef.h`. The actual equation path must be passed to the
 copied build tree from the same generation step that creates `macrodef.h`.
 The performance regression was caused by compiling/linking the
 `BSSN-EScalar` C wrapper into BSSN vacuum builds. For BSSN vacuum (`ABEtype=0`),
 the build must use:
 ```make
 BSSN_USE_TRANSFER_CACHE=1
 BSSN_USE_ESCALAR_C_KERNEL=0
 ```
 and must not link `bssn_escalar_rhs_c.o`.
 ## Required Migration Steps
 ### 1. Add an ABE type helper in `generate_macrodef.py`
 Add a helper that maps `input_data.Equation_Class` to the numeric `ABEtype`.
 Use the same mapping as `macrodef.h`:
 ```python
 def get_abe_type():
    if ( input_data.Equation_Class == "BSSN" ):
        return 0
    elif ( input_data.Equation_Class == "BSSN-EScalar" ):
        return 1
    elif ( input_data.Equation_Class == "BSSN-EM" ):
        return 3
    elif ( input_data.Equation_Class == "Z4C" ):
        return 2
    else:
        raise ValueError("Equation_Class setting error!!!")
 ```
 Update `generate_macrodef_h()` to print `#define ABEtype {get_abe_type()}`
 instead of duplicating the if/elif mapping.
 ### 2. Generate a makefile fragment
 In `generate_macrodef.py`, add:
 ```python
 def generate_build_config():
    file1 = open(os.path.join(input_data.File_directory, "AMSS_NCKU_build.mk"), "w")
    print("# Generated by generate_macrodef.py; do not edit manually.", file=file1)
    print(f"ABE_TYPE := {get_abe_type()}", file=file1)
    file1.close()
 ```
 This file is the build-time authority for the equation path.
 ### 3. Call and copy the generated build config
 In `AMSS_NCKU_Program.py`, after generating `macrodef.h` and `macrodef.fh`, call:
 ```python
 generate_macrodef.generate_build_config()
 print(" AMSS-NCKU build config AMSS_NCKU_build.mk has been generated. ")
 ```
 When copying generated files into `AMSS_NCKU_source_copy`, also copy:
 ```python
 build_config_path = os.path.join(File_directory, "AMSS_NCKU_build.mk")
 shutil.copy2(build_config_path, AMSS_NCKU_source_copy)
 ```
 ### 4. Make the source makefile consume the generated config
 At the top of `AMSS_NCKU_source/makefile`, after `include makefile.inc`, add:
 ```make
 -include AMSS_NCKU_build.mk
 ABE_TYPE ?= $(shell awk '/^[[:space:]]*\#define[[:space:]]+ABEtype/ {print $$3; exit}' macrodef.h 2>/dev/null)
 ```
 The generated `AMSS_NCKU_build.mk` is used during normal Python-driven builds.
 The `ABE_TYPE ?=` fallback, which reads `ABEtype` from `macrodef.h`, keeps manual
 source-tree builds usable.
 ### 5. Gate path-specific build options by `ABE_TYPE`
 Use effective build switches:
 ```make
 ifeq ($(USE_TRANSFER_CACHE),auto)
 ifeq ($(ABE_TYPE),0)
 EFFECTIVE_USE_TRANSFER_CACHE = 1
 else
 EFFECTIVE_USE_TRANSFER_CACHE = 0
 endif
 else
 EFFECTIVE_USE_TRANSFER_CACHE = $(USE_TRANSFER_CACHE)
 endif
 ifeq ($(USE_CXX_ESCALAR_KERNEL),1)
 ifeq ($(ABE_TYPE),1)
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 1
 else
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
 endif
 else
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
 endif
 TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
 ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
 ```
 Only add `bssn_escalar_rhs_c.o` when the effective EScalar C kernel switch is
 enabled:
 ```make
 ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
 CFILES += bssn_escalar_rhs_c.o
 endif
 ```
 ### 6. Use a safe transfer-cache default
 In `AMSS_NCKU_source/makefile.inc`, keep:
 ```make
 USE_TRANSFER_CACHE ?= auto
 ```
 With the effective switch logic above, this enables cached transfer for BSSN
 vacuum while keeping non-BSSN paths on the uncached path by default.
 ## Verification Checklist
 Run these checks after migrating:
 ```bash
 python3 -c "import generate_macrodef; generate_macrodef.generate_build_config()"
 cat GW150914/AMSS_NCKU_build.mk
 ```
 For BSSN, the generated file should contain:
 ```make
 ABE_TYPE := 0
 ```
 Dry-run the copied or source makefile:
 ```bash
 make -n -B INTERP_LB_MODE=off ABE | grep -E 'BSSN_USE_TRANSFER_CACHE|BSSN_USE_ESCALAR_C_KERNEL|bssn_escalar_rhs_c'
 ```
 Expected BSSN result:
 ```text
 -DBSSN_USE_TRANSFER_CACHE=1 -DBSSN_USE_ESCALAR_C_KERNEL=0
 ```
 and no `bssn_escalar_rhs_c.o` in the final link command.
 Run the full workflow:
 ```bash
 python3 AMSS_NCKU_Program.py
 ```
 For the 10-step BSSN test, compare coordinate output:
 ```bash
 python3 - <<'PY'
 from pathlib import Path
 old = Path('../GW150914-06457/AMSS_NCKU_output/bssn_BH.dat')
 new = Path('GW150914/AMSS_NCKU_output/bssn_BH.dat')
 def rows(path):
    out = []
    for line in path.read_text().splitlines():
        if not line.strip() or line.lstrip().startswith('#'):
            continue
        out.append([float(x) for x in line.split()])
    return out
 ro, rn = rows(old), rows(new)
 n = min(len(ro), len(rn))
 max_abs = 0.0
 for i in range(n):
    for a, b in zip(ro[i], rn[i]):
        max_abs = max(max_abs, abs(a - b))
 print(f"old_rows={len(ro)} new_rows={len(rn)} compared_rows={n}")
 print(f"max_abs_diff={max_abs:.17g}")
 PY
 ```
 For the validated migration, the first 10 rows matched exactly:
 ```text
 max_abs_diff=0
 ```
--- a/generate_macrodef.py
+++ b/generate_macrodef.py
@@ -12,37 +12,6 @@ import os
 import AMSS_NCKU_Input as input_data          ## import program input file
 ##################################################################
 def get_abe_type():
    """Return the numeric ABEtype code for ``input_data.Equation_Class``.

    Mapping: "BSSN" -> 0, "BSSN-EScalar" -> 1, "Z4C" -> 2, "BSSN-EM" -> 3.

    Raises:
        ValueError: if Equation_Class is none of the four supported names.
    """
    equation_class = input_data.Equation_Class
    ## Compared in order with ==, so any value type is accepted without hashing.
    known_classes = (
        ("BSSN", 0),
        ("BSSN-EScalar", 1),
        ("BSSN-EM", 3),
        ("Z4C", 2),
    )
    for class_name, abe_type in known_classes:
        if equation_class == class_name:
            return abe_type
    raise ValueError("Equation_Class setting error!!!")
 ##################################################################
 ## Generate the makefile fragment used by the copied source tree.
 ## The source-tree macrodef.h is not authoritative because macro files
 ## are regenerated under File_directory for each run.
 def generate_build_config():
    """Write ``AMSS_NCKU_build.mk`` into ``input_data.File_directory``.

    The fragment holds a header comment and one ``ABE_TYPE := <n>`` line,
    where ``<n>`` is the value returned by ``get_abe_type()``.

    Raises:
        ValueError: propagated from ``get_abe_type()`` for an unsupported
            Equation_Class; in that case no file is created or truncated.
        OSError: if the file cannot be opened or written.
    """
    ## Resolve the type before opening the file: an invalid Equation_Class must
    ## not leave a truncated fragment (or an open handle) behind.
    abe_type = get_abe_type()
    build_config_path = os.path.join(input_data.File_directory, "AMSS_NCKU_build.mk")
    ## The with-block closes the file even when a write fails.
    with open( build_config_path, "w" ) as file1:
        print( "# Generated by generate_macrodef.py; do not edit manually.", file=file1 )
        print( f"ABE_TYPE := {abe_type}",                                    file=file1 )
 ##################################################################
 ## Generate the macro file macrodef.h according to user settings
@@ -89,10 +58,19 @@ def generate_macrodef_h():
    # 2: Z4c vacuum
    # 3: coupled to Maxwell field
-    try:
+    if ( input_data.Equation_Class == "BSSN" ):
-        print( f"#define ABEtype {get_abe_type()}", file=file1 )
+        print( "#define ABEtype 0", file=file1 )
-        print(                                      file=file1 )
+        print(                      file=file1 )
-    except ValueError:
+    elif ( input_data.Equation_Class == "BSSN-EScalar" ):
        print( "#define ABEtype 1", file=file1 )
        print(                      file=file1 )
    elif ( input_data.Equation_Class == "BSSN-EM" ):
        print( "#define ABEtype 3", file=file1 )
        print(                      file=file1 )
    elif ( input_data.Equation_Class == "Z4C" ):
        print( "#define ABEtype 2", file=file1 )
        print(                      file=file1 )
    else:
        print( "Equation_Class setting error!!!"                )
        print()
        print( "# Equation type #define ABEtype setting error!!!", file=file1 )
@@ -226,7 +204,7 @@ def generate_macrodef_h():
    # use GPU or not
    if ( input_data.GPU_Calculation == "yes"):
-        print( "#define USE_GPU",   file=file1 )
+        print( "//#define USE_GPU",   file=file1 )
        print(                      file=file1 )
    elif ( input_data.GPU_Calculation == "no"):
        print( "//#define USE_GPU", file=file1 )
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -9,6 +9,8 @@
 import AMSS_NCKU_Input as input_data
 import os
 import shutil
 import subprocess
 import time
@@ -56,6 +58,124 @@ BUILD_JOBS = 64
 ##################################################################
 def _truthy(value, default=False):
    """Interpret a loosely typed switch value as a boolean.

    Booleans are returned unchanged. ``None`` and strings that are empty after
    stripping yield ``default``. Any other value is converted with ``str()``,
    stripped and lower-cased; it is true only if it is one of
    "1", "yes", "y", "true", "on", "enable", "enabled".
    """
    if isinstance(value, bool):
        return value
    normalized = "" if value is None else str(value).strip().lower()
    if not normalized:
        return default
    accepted = {"1", "yes", "y", "true", "on", "enable", "enabled"}
    return normalized in accepted
 def _input_or_env(input_name, env_name, default=None):
    """Look up a setting, letting the environment override the input file.

    Returns ``os.environ[env_name]`` (always a string, possibly empty) when
    that variable is set; otherwise the ``input_name`` attribute of
    ``input_data``, or ``default`` if the attribute does not exist.
    """
    env_value = os.environ.get(env_name)
    ## os.environ only stores strings, so None can only mean "variable not set".
    if env_value is not None:
        return env_value
    return getattr(input_data, input_name, default)
 def _start_cuda_mps_if_requested(runtime_env):
    """Start a CUDA MPS control daemon for this run when MPS is requested.

    MPS is only considered when ``input_data.GPU_Calculation`` is "yes". The
    switch comes from ``AMSS_CUDA_AUTO_MPS`` / ``input_data.CUDA_Auto_MPS``
    and defaults to on when more than one MPI process is configured.

    Args:
        runtime_env: environment mapping for the MPI run. It is updated in
            place with the MPS pipe/log directories when a daemon is reused
            or successfully started, and left untouched otherwise.

    Returns:
        True only if this call started a new daemon. False when GPU runs or
        MPS are disabled, the control command is missing, an existing daemon
        is reused, or the daemon failed to start.
    """
    if input_data.GPU_Calculation != "yes":
        return False

    ## MPS is on by default only for multi-process runs.
    default_auto_mps = int(getattr(input_data, "MPI_processes", 1)) > 1
    requested = _input_or_env("CUDA_Auto_MPS", "AMSS_CUDA_AUTO_MPS", default_auto_mps)
    if not _truthy(requested, default=default_auto_mps):
        return False

    mps_control = shutil.which("nvidia-cuda-mps-control")
    if not mps_control:
        print(" CUDA MPS control command was not found; running without MPS.")
        return False

    ## Per-user default directories under /tmp.
    uid = os.getuid()
    pipe_dir = str(_input_or_env("CUDA_MPS_PIPE_DIRECTORY", "CUDA_MPS_PIPE_DIRECTORY",
                                 f"/tmp/amss-ncku-mps-{uid}"))
    log_dir = str(_input_or_env("CUDA_MPS_LOG_DIRECTORY", "CUDA_MPS_LOG_DIRECTORY",
                                f"/tmp/amss-ncku-mps-log-{uid}"))
    for directory in (pipe_dir, log_dir):
        os.makedirs(directory, exist_ok=True)

    mps_dirs = {
        "CUDA_MPS_PIPE_DIRECTORY": pipe_dir,
        "CUDA_MPS_LOG_DIRECTORY": log_dir,
    }

    ## A "control" entry in the pipe directory is taken as a running daemon;
    ## it is reused and reported as not started by this call.
    if os.path.exists(os.path.join(pipe_dir, "control")):
        runtime_env.update(mps_dirs)
        print(f" Reusing CUDA MPS daemon: {pipe_dir}")
        return False

    print(f" Starting CUDA MPS daemon for this run: {pipe_dir}")
    ## The daemon gets a private copy of the environment, so runtime_env stays
    ## unchanged if the start fails.
    daemon_env = runtime_env.copy()
    daemon_env.update(mps_dirs)
    result = subprocess.run([mps_control, "-d"], env=daemon_env, text=True,
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    if result.returncode == 0:
        runtime_env.update(mps_dirs)
        return True

    print(" CUDA MPS daemon did not start; running without MPS.")
    if result.stdout:
        print(result.stdout, end="")
    return False
 def _stop_cuda_mps(runtime_env):
    """Ask the CUDA MPS control daemon to quit.

    Sends "quit" on stdin to ``nvidia-cuda-mps-control``, run with
    ``runtime_env`` as its environment. Does nothing when the command is not
    on PATH. The command's output is captured and discarded, and its exit
    status is not checked.
    """
    mps_control = shutil.which("nvidia-cuda-mps-control")
    if mps_control:
        subprocess.run([mps_control], input="quit\n", env=runtime_env, text=True,
                       stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 def _gpu_runtime_env():
    """Build the environment mapping used for a GPU run.

    Starts from a copy of ``os.environ`` and fills in default AMSS_* runtime
    switches for every variable that is not already set. For the "Z4C"
    equation class the two switches AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP
    and AMSS_CUDA_KEEP_ALL_LEVELS default to "0" instead of "1". A few
    diagnostic variables are additionally taken from same-purpose attributes
    of ``input_data`` when the attribute exists and the variable is unset.

    Returns:
        dict: the new environment; ``os.environ`` itself is not modified.
    """
    is_z4c = getattr(input_data, "Equation_Class", "") == "Z4C"
    keep_resident_default = "0" if is_z4c else "1"

    switch_defaults = {
        "AMSS_INTERP_FAST": "1",
        "AMSS_INTERP_GPU": "1",
        "AMSS_ANALYSIS_MAP_EVERY": "1000000",
        "AMSS_CUDA_AWARE_MPI": "1",
        "AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP": "1",
        "AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP": keep_resident_default,
        "AMSS_CUDA_KEEP_ALL_LEVELS": keep_resident_default,
        "AMSS_CUDA_Z4C_AMR_DEVICE": "0",
        "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
        "AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
        "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
        "AMSS_CUDA_PIN_ESCALAR_TRANSFERS": "0",
        "AMSS_ESCALAR_GPU_RK": "0",
    }

    runtime_env = os.environ.copy()
    ## Values already present in the caller's environment always win.
    for env_name, fallback in switch_defaults.items():
        if env_name not in runtime_env:
            runtime_env[env_name] = fallback

    ## (environment variable, input_data attribute) pairs with no built-in default.
    input_overrides = (
        ("AMSS_INTERP_FAST_COMPARE", "AMSS_Interp_Fast_Compare"),
        ("AMSS_INTERP_FAST_COMPARE_LIMIT", "AMSS_Interp_Fast_Compare_Limit"),
        ("AMSS_INTERP_FAST_COMPARE_TOL", "AMSS_Interp_Fast_Compare_Tol"),
        ("AMSS_GPU_STAGE_TIMING", "AMSS_GPU_Stage_Timing"),
        ("AMSS_GPU_STAGE_TIMING_EVERY", "AMSS_GPU_Stage_Timing_Every"),
    )
    for env_name, input_name in input_overrides:
        if env_name in runtime_env:
            continue
        if hasattr(input_data, input_name):
            runtime_env[env_name] = str(getattr(input_data, input_name))
    return runtime_env
 ##################################################################
 ##################################################################
@@ -70,9 +190,9 @@ def makefile_ABE():
    ## Build command with CPU binding to nohz_full cores
    if (input_data.GPU_Calculation == "no"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE"
+        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=0 USE_CUDA_Z4C=0 ABE"
    elif (input_data.GPU_Calculation == "yes"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
+        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=1 USE_CUDA_Z4C=1 ABE_CUDA"
    else:
        print( " CPU/GPU numerical calculation setting is wrong " )
        print(                                                    )
@@ -145,29 +265,55 @@ def run_ABE():
    print(                                                      )
    ## Define the command to run; cast other values to strings as needed
    mpi_env = None
    started_mps = False
    if (input_data.GPU_Calculation == "no"):
        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE_CUDA"
        mpi_command_outfile = "ABEGPU_out.log"
        mpi_env = _gpu_runtime_env()
        started_mps = _start_cuda_mps_if_requested(mpi_env)
        print(" GPU optimized runtime switches:")
        print(f"   AMSS_INTERP_FAST={mpi_env.get('AMSS_INTERP_FAST', '')}")
        print(f"   AMSS_INTERP_GPU={mpi_env.get('AMSS_INTERP_GPU', '')}")
        print(f"   AMSS_ANALYSIS_MAP_EVERY={mpi_env.get('AMSS_ANALYSIS_MAP_EVERY', '')}")
        print(f"   AMSS_CUDA_AWARE_MPI={mpi_env.get('AMSS_CUDA_AWARE_MPI', '')}")
        print(f"   AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP', '')}")
        print(f"   AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP', '')}")
        print(f"   AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}")
        print(f"   AMSS_CUDA_Z4C_AMR_DEVICE={mpi_env.get('AMSS_CUDA_Z4C_AMR_DEVICE', '')}")
        print(f"   AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
        print(f"   AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
        print(f"   AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")
        print(f"   AMSS_CUDA_PIN_ESCALAR_TRANSFERS={mpi_env.get('AMSS_CUDA_PIN_ESCALAR_TRANSFERS', '')}")
        print(f"   AMSS_ESCALAR_GPU_RK={mpi_env.get('AMSS_ESCALAR_GPU_RK', '')}")
        if "CUDA_MPS_PIPE_DIRECTORY" in mpi_env:
            print(f"   CUDA_MPS_PIPE_DIRECTORY={mpi_env['CUDA_MPS_PIPE_DIRECTORY']}")
-    ## Execute the MPI command and stream output
+    try:
-    mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+        ## Execute the MPI command and stream output
        mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT, text=True, env=mpi_env)
-    ## Write ABE run output to file while printing to stdout
+        ## Write ABE run output to file while printing to stdout
-    with open(mpi_command_outfile, 'w') as file0:  
+        with open(mpi_command_outfile, 'w') as file0:
-        ## Read and print output lines; also write each line to file
+            ## Read and print output lines; also write each line to file
-        for line in mpi_process.stdout:
+            for line in mpi_process.stdout:
-            print(line, end='')  # stream output in real time
+                print(line, end='')  # stream output in real time
-            file0.write(line)    # write the line to file
+                file0.write(line)    # write the line to file
-            file0.flush()        # flush to ensure each line is written immediately (optional)            
+                file0.flush()        # flush to ensure each line is written immediately (optional)
    file0.close()
-    ## Wait for the process to finish
+        ## Wait for the process to finish
-    mpi_return_code = mpi_process.wait()
+        mpi_return_code = mpi_process.wait()
        if mpi_return_code != 0:
            raise subprocess.CalledProcessError(mpi_return_code, mpi_command)
    finally:
        if started_mps:
            _stop_cuda_mps(mpi_env)
    print(                                           )
    print( " The ABE/ABEGPU simulation is finished " )
Author	SHA1	Message	Date
CGH0S7	e4c10eca0f	Stabilize EScalar CUDA fallback path	2026-05-03 16:05:47 +08:00
CGH0S7	4430d04ee7	Stabilize EScalar CUDA sync defaults	2026-05-03 00:24:50 +08:00
CGH0S7	74ba5feb86	Pin EScalar scalar CUDA transfers	2026-05-02 19:21:57 +08:00
CGH0S7	6f28111a43	Keep EScalar mixed GPU RP opt-in	2026-05-02 18:38:43 +08:00
CGH0S7	f638cbc4e8	Add mixed GPU RP path for EScalar	2026-05-02 18:27:26 +08:00
CGH0S7	59a216ad93	Optimize BSSN EScalar GPU path baseline	2026-05-02 18:19:15 +08:00
CGH0S7	52beb4d153	Checkpoint Z4C CUDA resident sync progress	2026-05-02 10:53:52 +08:00
CGH0S7	ba61702fc0	Checkpoint Z4C CUDA throttling progress	2026-05-02 10:04:23 +08:00
CGH0S7	fcd98649f6	Checkpoint Z4C CUDA optimization progress	2026-05-02 08:55:25 +08:00
CGH0S7	a5c8188305	Disable unsafe Z4C AMR device path by default	2026-05-02 01:36:41 +08:00
CGH0S7	383e936e88	Save Z4C CUDA optimization progress	2026-05-02 00:49:02 +08:00
CGH0S7	531b31e8db	Stabilize cached Z4C CUDA sync after regrid	2026-05-01 20:04:04 +08:00
CGH0S7	30b778daa3	Save Z4C CUDA transfer progress	2026-05-01 18:51:19 +08:00
CGH0S7	db9383e439	Initialize cached sync runtime in derived evolvers	2026-05-01 18:34:43 +08:00
CGH0S7	35b6ceff02	Broaden cached CUDA sync paths	2026-05-01 18:03:04 +08:00
CGH0S7	51f3819892	Save generated source formatting state	2026-04-30 20:47:44 +08:00
CGH0S7	a9a3809148	Default Python launcher to fast GPU path	2026-04-30 20:15:34 +08:00
CGH0S7	b1974ef146	Stabilize device AMR restrict across regrid	2026-04-30 20:01:18 +08:00
CGH0S7	be9033f449	Add optional CUDA surface interpolation	2026-04-30 19:21:19 +08:00
CGH0S7	6835608f92	Add configurable analysis MAP cadence	2026-04-30 19:10:12 +08:00
CGH0S7	e0d0673c8e	Enable optimized GPU runs from Python launcher	2026-04-30 18:31:31 +08:00
CGH0S7	da4d56ccf7	Optimize BSSN surface interpolation fast path	2026-04-30 18:25:21 +08:00
CGH0S7	a6483d013d	Add CUDA AMR restrict diagnostics	2026-04-30 12:20:44 +08:00
CGH0S7	8486532920	Add resident BSSN GPU point interpolation	2026-04-30 11:39:15 +08:00
CGH0S7	18e9c9cc50	Optimize BSSN CUDA resident AMR prolong path	2026-04-30 10:58:15 +08:00
CGH0S7	1ee229a91f	Add keyed BSSN CUDA resident banks	2026-04-29 19:44:19 +08:00
CGH0S7	68eab03bac	Add opt-in BSSN CUDA resident AMR path	2026-04-29 19:15:37 +08:00
CGH0S7	090d8657ae	Optimize BSSN CUDA state transfers	2026-04-29 18:34:31 +08:00
CGH0S7	22c1e7168b	Optimize BSSN CUDA resident state and CUDA-aware MPI	2026-04-29 17:05:10 +08:00
ianchb	a0dab90bcb	Switch to NVIDIA HPC Toolchain	2026-04-29 08:31:49 +08:00
ianchb	c689cc8dc9	[WIP] Add CUDA support for Z4C Rewritten done by Codex. This still has errors, do not pick this one now.	2026-04-27 11:58:43 +08:00
ianchb	60fee8f1c1	Fix Z4C C++ gauge damping ordering	2026-04-26 15:38:13 +08:00
ianchb	843b116954	Add C++ Z4C RHS path and port some BSSN optimizations	2026-04-25 10:39:01 +08:00
ianchb	c768e1220b	Also disable cached sync for Z4C	2026-04-25 10:25:54 +08:00
CGH0S7	02f149e2e3	Disable cached sync for BSSN-EScalar	2026-04-25 10:17:47 +08:00
CGH0S7	422e8ec4dc	Fallback BSSN-EScalar restrict/prolong path	2026-04-25 10:10:34 +08:00
CGH0S7	c4909b9843	更新精度检查脚本加入图像比对检查 (cherry picked from commit `ac82ebd889`)	2026-04-25 09:40:12 +08:00
ianchb	f521a97563	Fix ABE CPU version build error	2026-04-25 09:39:49 +08:00
ianchb	53c55451b3	Update makefile and scripts for CUDA BSSN configuration and build commands	2026-04-25 09:19:50 +08:00
CGH0S7	768345954f	Add optional BSSN kernel profiling switches (cherry picked from commit `9c31384b2f`)	2026-04-25 08:39:43 +08:00
CGH0S7	9a6df6438b	Remove dead chi derivative setup in BSSN RHS (cherry picked from commit `e4e741caa1`)	2026-04-25 08:38:01 +08:00
CGH0S7	8e9463aa90	Localize chi Ricci intermediates in RHS (cherry picked from commit `65e0f95f40`)	2026-04-25 08:37:41 +08:00
CGH0S7	7c6f15002e	Elide dead stores in BSSN RHS hot path (cherry picked from commit `f9fbf97e64`)	2026-04-25 08:37:40 +08:00
CGH0S7	6410c62e3e	Add fine-grained step timing and trim BH RHS overhead (cherry picked from commit `968522995b`)	2026-04-25 08:37:19 +08:00
CGH0S7	11977eb82f	Merge wave and mass extraction interpolation (cherry picked from commit `f3988ac8ca`)	2026-04-25 08:25:34 +08:00
CGH0S7	cce8a44fc4	Cache wave extraction angular kernels (cherry picked from commit `e4c25eb21f`)	2026-04-25 08:24:36 +08:00
CGH0S7	c589097618	Reuse mass integrand across detector radii (cherry picked from commit `4b10519876`)	2026-04-25 08:24:11 +08:00
CGH0S7	b713e5a9be	Batch constraint norm reductions (cherry picked from commit `3a58273501`)	2026-04-25 08:22:00 +08:00
CGH0S7	0396701572	Optimize constraint refresh after regrid (cherry picked from commit `5c65cea2f0`)	2026-04-25 08:18:51 +08:00
ianchb	bb20c9a876	fix ADM Constrant Violation Analysis	2026-04-15 19:19:16 +08:00
ianchb	8fe60ea703	Add zero matter handling and interpolation for resident state in CUDA BSSN	2026-04-15 00:25:53 +08:00
ianchb	9ab7e7c7f9	Fuse phases 5 and 6 for Gamma_rhs computation and optimize phases 8 and 9 for efficiency	2026-04-14 23:23:04 +08:00
ianchb	f9119e8a2a	Add resident-GA mode switch and simplify sync logic	2026-04-14 21:09:27 +08:00
ianchb	726d743376	Fuse Ricci assembly and optimize trK/Aij gauge kernels	2026-04-14 19:20:12 +08:00
ianchb	af344bf1e5	Add Phase-10 Ricci kernels and batch launch flow	2026-04-14 19:00:22 +08:00
ianchb	7191fc0b96	Move resident sync comm buffers into StepAllocation pool	2026-04-13 21:04:44 +08:00
ianchb	b3ec244cf9	Add batched first/second derivative kernels for CUDA RHS	2026-04-13 20:51:08 +08:00
ianchb	e952ee8e91	Batch GA/BH subset sync with indexed GPU pack/unpack buffers	2026-04-13 20:40:09 +08:00
ianchb	c5d1268dd1	Batch patch-boundary copy and gate CPU BC in GPU substeps	2026-04-13 11:52:17 +08:00
ianchb	4bdfc90f22	Pass pointer tables as kernel args and skip redundant symbol uploads	2026-04-13 11:19:00 +08:00
ianchb	c49a4e00c9	Batch symbd_pack/lopsided/kodiss over all state variables	2026-04-13 11:02:55 +08:00
ianchb	1b3c0b80d2	Refactor CUDA step buffers to remove loop-time allocations	2026-04-13 10:33:03 +08:00
ianchb	636e35bfd8	Add direct CUDA resident-state sync path and profiling hooks	2026-04-13 00:57:05 +08:00
ianchb	7f2a391dd2	Cache matter fields in StepContext across RK4 substeps	2026-04-12 22:19:45 +08:00
ianchb	4fa12a2009	Integrate CUDA support into RK4 substep execution	2026-04-12 22:11:44 +08:00
ianchb	86a683de26	Replace legacy ABEGPU stack with ABE_CUDA backend	2026-04-12 21:19:14 +08:00
ianchb	aaf7bf0a26	Merge remote-tracking branch 'origin/main'	2026-04-12 20:55:42 +08:00
ianchb	9c44d1c885	fix(bssn_rhs)	2026-03-03 16:00:45 +08:00
ianchb	4b9de28feb	将 Restrict/Prolong 链路里的 coarse-level Sync_cached 改为可选（默认跳过） OutBdLow2Hi_cached 读的是 coarse owned 区域（非 coarse ghost/buffer）回退旧行为：编译时定义 RP_SYNC_COARSE_AFTER_RESTRICT=1	2026-03-03 14:25:27 +08:00
ianchb	4eb5dc4ddb	删除重复的一次 chi 一阶导计算	2026-03-03 14:23:56 +08:00