Use cudaMemcpyAsync with dedicated transfer stream for H2D/D2H transfers

Add cudaStream_t to GpuBuffers for async H2D/D2H transfers in BSSN and Z4C substep functions. Adds cudaStreamSynchronize(0) before D2H to enforce kernel/transfer ordering across streams, and a sync between state and matter H2D uploads to prevent h_stage race on RK4==0. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-28 08:23:34 +08:00
32 changed files with 11828 additions and 24106 deletions
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,7 +16,7 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 2                             ## number of mpi processes used in the simulation
+MPI_processes    = 8                             ## number of mpi processes used in the simulation
 GPU_Calculation  = "yes"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
@@ -158,7 +158,7 @@ Detector_Rmax   = 160.0                  ## farest dector distance
 ## Setting the apprent horizon
-AHF_Find       = "yes"                    ## whether to find the apparent horizon: choose "yes" or "no"
+AHF_Find       = "no"                    ## whether to find the apparent horizon: choose "yes" or "no"
 AHF_Find_Every = 24
 AHF_Dump_Time  = 20.0
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -58,36 +58,31 @@ File_directory = os.path.join(input_data.File_directory)
 ## If the specified output directory exists, ask the user whether to continue
 if os.path.exists(File_directory):
-    auto_overwrite = str(getattr(input_data, "Auto_Overwrite_Output", "yes")).strip().lower()
+    print( " Output dictionary has been existed !!!  "                                                              )
-    if auto_overwrite in ("1", "yes", "y", "true", "on", "continue"):
+    print( " If you want to overwrite the existing file directory, please input 'continue' in the terminal !! "     ) 
-        print( " Output dictionary has been existed; Auto_Overwrite_Output=yes, continue the calculation. " )
+    print( " If you want to retain the existing file directory, please input 'stop' in the terminal to stop the "   ) 
-        print(                                                                                              )
+    print( " simulation. Then you can reset the output dictionary in the input script file AMSS_NCKU_Input.py !!! " )
-    else:
+    print(                                                                                                          )
-        print( " Output dictionary has been existed !!!  "                                                              )
+    ## Prompt whether to overwrite the existing directory
-        print( " If you want to overwrite the existing file directory, please input 'continue' in the terminal !! "     ) 
+    while True:
-        print( " If you want to retain the existing file directory, please input 'stop' in the terminal to stop the "   ) 
+        try:
-        print( " simulation. Then you can reset the output dictionary in the input script file AMSS_NCKU_Input.py !!! " )
+            inputvalue = input()
-        print(                                                                                                          )
+            ## If the user agrees to overwrite, proceed and remove the existing directory
-        ## Prompt whether to overwrite the existing directory
+            if ( inputvalue == "continue" ):
-        while True:
+                print( " Continue the calculation !!! " )
-            try:
+                print(                                  )
-                inputvalue = input()
+                break  
-                ## If the user agrees to overwrite, proceed and remove the existing directory
+            ## If the user chooses not to overwrite, exit and keep the existing directory
-                if ( inputvalue == "continue" ):
+            elif ( inputvalue == "stop" ):
-                    print( " Continue the calculation !!! " )
+                print( " Stop the calculation !!! "    )
-                    print(                                  )
+                sys.exit() 
-                    break  
+            ## If the user input is invalid, prompt again
-                ## If the user chooses not to overwrite, exit and keep the existing directory
+            else:
                elif ( inputvalue == "stop" ):
                    print( " Stop the calculation !!! "    )
                    sys.exit() 
                ## If the user input is invalid, prompt again
                else:
                    print( " Please input your choice !!! "                   )
                    print( " Input 'continue' or 'stop' in the terminal !!! " )
            except ValueError:
                print( " Please input your choice !!! "                   )
                print( " Input 'continue' or 'stop' in the terminal !!! " )
        except ValueError:
            print( " Please input your choice !!! "                   )
            print( " Input 'continue' or 'stop' in the terminal !!! " )
 ## Remove the existing output directory if present
 shutil.rmtree(File_directory, ignore_errors=True)
--- a/AMSS_NCKU_Program_Plot.py
+++ b/AMSS_NCKU_Program_Plot.py
@@ -1,100 +0,0 @@
 ##################################################################
 ##
 ## AMSS-NCKU Plot-Only Restart Script
 ## Author: Xiaoqu / Claude
 ## 2026/05/12
 ##
 ## This script checks for existing output data from AMSS_NCKU_Program.py.
 ## If data exists, it skips all computation and goes directly to plotting,
 ## saving time when plotting was interrupted.
 ## If no data is found, it exits with a message.
 ##
 ##################################################################
 ## Guard against re-execution by multiprocessing child processes.
 if __name__ != '__main__':
    import sys as _sys
    _sys.exit(0)
 import os
 import sys
 import AMSS_NCKU_Input as input_data
 ##################################################################
 ## Construct paths from input configuration
 File_directory         = os.path.join(input_data.File_directory)
 output_directory       = os.path.join(File_directory, "AMSS_NCKU_output")
 binary_results_directory = os.path.join(output_directory, input_data.Output_directory)
 figure_directory       = os.path.join(File_directory, "figure")
 ##################################################################
 ## Check whether the required output data files exist
 required_files = [
    os.path.join(binary_results_directory, "bssn_BH.dat"),
    os.path.join(binary_results_directory, "bssn_ADMQs.dat"),
    os.path.join(binary_results_directory, "bssn_psi4.dat"),
    os.path.join(binary_results_directory, "bssn_constraint.dat"),
 ]
 missing_files = [f for f in required_files if not os.path.exists(f)]
 if missing_files:
    print(" No existing AMSS_NCKU_Program.py output data found. ")
    print(" The following required files are missing: ")
    for f in missing_files:
        print(f"   {f}")
    print()
    print(" Please run AMSS_NCKU_Program.py first to generate the simulation data. ")
    print(" Exiting. ")
    sys.exit(1)
 print(" Found existing AMSS_NCKU_Program.py output data. "          )
 print(" Skipping all computation and going directly to plotting. " )
 print()
 ## Ensure the figure directory exists (it should, but be safe)
 os.makedirs(figure_directory, exist_ok=True)
 ##################################################################
 ## Plot the AMSS-NCKU program results
 import plot_xiaoqu
 import plot_GW_strain_amplitude_xiaoqu
 from parallel_plot_helper import run_plot_tasks_parallel
 plot_tasks = []
 ## Plot black hole trajectory
 plot_tasks.append((plot_xiaoqu.generate_puncture_orbit_plot,   (binary_results_directory, figure_directory)))
 plot_tasks.append((plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory)))
 ## Plot black hole separation vs. time
 plot_tasks.append((plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory)))
 ## Plot gravitational waveforms (psi4 and strain amplitude)
 for i in range(input_data.Detector_Number):
    plot_tasks.append((plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i)))
    plot_tasks.append((plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i)))
 ## Plot ADM mass evolution
 for i in range(input_data.Detector_Number):
    plot_tasks.append((plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i)))
 ## Plot Hamiltonian constraint violation over time
 for i in range(input_data.grid_level):
    plot_tasks.append((plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i)))
 run_plot_tasks_parallel(plot_tasks)
 ## Plot stored binary data (runs serially, not in the parallel pool)
 plot_xiaoqu.generate_binary_data_plot(binary_results_directory, figure_directory)
 print()
 print(" Plotting completed successfully. ")
 print()
--- a/AMSS_NCKU_source/ABE.C
+++ b/AMSS_NCKU_source/ABE.C
@@ -198,16 +198,16 @@ int main(int argc, char *argv[])
      if (myrank == 0)
      {
            string out_dir;
-            string filename;
+            char filename[50];
            map<string, string>::iterator iter;
            iter = parameters::str_par.find("output dir");
            if (iter != parameters::str_par.end())
            {
                  out_dir = iter->second;
            }
-            filename = out_dir + "/setting.par";
+            sprintf(filename, "%s/setting.par", out_dir.c_str());
            ofstream setfile;
-            setfile.open(filename.c_str(), ios::trunc);
+            setfile.open(filename, ios::trunc);
            if (!setfile.good())
            {
@@ -484,11 +484,7 @@ int main(int argc, char *argv[])
            cout << endl;
      }
-      // Let the process teardown reclaim the simulation object.  Some derived
+      delete ADM;
      // equation classes keep MPI/CUDA-backed state whose destructor ordering
      // is fragile at program shutdown.
      if (getenv("AMSS_DELETE_ADM_ON_EXIT"))
            delete ADM;
      //=======================caculation done=============================================================
--- a/AMSS_NCKU_source/Block.C
+++ b/AMSS_NCKU_source/Block.C
@@ -12,61 +12,7 @@ using namespace std;
 #include "Block.h"
 #include "misc.h"
-#if USE_CUDA_BSSN || USE_CUDA_Z4C
+Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
 #include <cuda_runtime_api.h>
 #endif
 namespace {
 bool cuda_pin_gridfuncs_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_PIN_GRIDFUNCS");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 double *alloc_gridfunc(size_t count, unsigned char &pinned)
 {
  pinned = 0;
 #if USE_CUDA_BSSN || USE_CUDA_Z4C
  if (cuda_pin_gridfuncs_enabled())
  {
    double *ptr = 0;
    cudaError_t err = cudaMallocHost((void **)&ptr, count * sizeof(double));
    if (err == cudaSuccess)
    {
      pinned = 1;
      return ptr;
    }
    cudaGetLastError();
  }
 #endif
  return (double *)malloc(sizeof(double) * count);
 }
 void free_gridfunc(double *ptr, unsigned char pinned)
 {
  if (!ptr)
    return;
 #if USE_CUDA_BSSN || USE_CUDA_Z4C
  if (pinned)
  {
    cudaFreeHost(ptr);
    return;
  }
 #else
  (void)pinned;
 #endif
  free(ptr);
 }
 }
 Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), lev(levi), cgpu(cgpui), ingfs(ingfsi), fngfs(fngfsi), igfs(0), fgfs(0), fgfs_pinned(0)
 {
  for (int i = 0; i < dim; i++)
    X[i] = 0;
@@ -124,10 +70,9 @@ Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fng
    int nn = shape[0] * shape[1] * shape[2];
    fgfs = new double *[fngfs];
    fgfs_pinned = new unsigned char[fngfs];
    for (int i = 0; i < fngfs; i++)
    {
-      fgfs[i] = alloc_gridfunc((size_t)nn, fgfs_pinned[i]);
+      fgfs[i] = (double *)malloc(sizeof(double) * nn);
      if (!(fgfs[i]))
      {
        cout << "on node#" << rank << ", out of memory when constructing Block." << endl;
@@ -162,13 +107,11 @@ Block::~Block()
      free(igfs[i]);
    delete[] igfs;
    for (int i = 0; i < fngfs; i++)
-      free_gridfunc(fgfs[i], fgfs_pinned ? fgfs_pinned[i] : 0);
+      free(fgfs[i]);
    delete[] fgfs;
    delete[] fgfs_pinned;
    X[0] = X[1] = X[2] = 0;
    igfs = 0;
    fgfs = 0;
    fgfs_pinned = 0;
  }
 }
 void Block::checkBlock()
@@ -244,8 +187,6 @@ void Block::swapList(MyList<var> *VarList1, MyList<var> *VarList2, int myrank)
    while (varl1 && varl2)
    {
      misc::swap<double *>(fgfs[varl1->data->sgfn], fgfs[varl2->data->sgfn]);
      if (fgfs_pinned)
        misc::swap<unsigned char>(fgfs_pinned[varl1->data->sgfn], fgfs_pinned[varl2->data->sgfn]);
      varl1 = varl1->next;
      varl2 = varl2->next;
    }
--- a/AMSS_NCKU_source/Block.h
+++ b/AMSS_NCKU_source/Block.h
@@ -18,10 +18,9 @@ public:
   int ingfs, fngfs;
   int *(*igfs);
   double *(*fgfs);
   unsigned char *fgfs_pinned;
 public:
-   Block() : rank(0), lev(0), cgpu(0), ingfs(0), fngfs(0), igfs(0), fgfs(0), fgfs_pinned(0) {};
+   Block() {};
   Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfs, int levi, const int cgpui = 0);
   ~Block();
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -14,9 +14,6 @@ using namespace std;
 #include "MPatch.h"
 #include "Parallel.h"
 #include "fmisc.h"
 #if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
 #endif
 #ifdef INTERP_LB_PROFILE
 #include "interp_lb_profile.h"
 #endif
@@ -181,444 +178,6 @@ int find_block_index_for_point(const BlockBinIndex &index, const double *pox, co
  return -1;
 }
 inline int fortran_idint_local(double x)
 {
  return int(x);
 }
 bool interp_fast_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_INTERP_FAST");
    enabled = (!env || atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool interp_gpu_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_INTERP_GPU");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool interp_fast_compare_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_INTERP_FAST_COMPARE");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 double interp_fast_compare_tol()
 {
  static double tol = -1.0;
  if (tol < 0.0)
  {
    const char *env = getenv("AMSS_INTERP_FAST_COMPARE_TOL");
    tol = (env && atof(env) > 0.0) ? atof(env) : 1.0e-11;
  }
  return tol;
 }
 long long interp_fast_compare_limit()
 {
  static long long limit = -1;
  if (limit < 0)
  {
    const char *env = getenv("AMSS_INTERP_FAST_COMPARE_LIMIT");
    limit = (env && atoll(env) > 0) ? atoll(env) : 4096;
  }
  return limit;
 }
 struct FastInterpStencil
 {
  int cxB[dim];
  double cx[dim];
  double wx[8];
  double wy[8];
  double wz[8];
  int nsamples;
  int loc[512];
  unsigned char sign_mask[512];
  double weight[512];
 };
 inline void lagrange_unit_weights(double x, int ordn, double *w)
 {
  for (int i = 0; i < ordn; i++)
  {
    double num = 1.0;
    double den = 1.0;
    for (int j = 0; j < ordn; j++)
    {
      if (j == i)
        continue;
      num *= (x - double(j));
      den *= double(i - j);
    }
    w[i] = num / den;
  }
 }
 inline void z_unit_weights(double x, int ordn, double *w)
 {
  if (ordn == 6)
  {
    static const double c_uniform[6] = {-1.0, 5.0, -10.0, 10.0, -5.0, 1.0};
    for (int i = 0; i < 6; i++)
    {
      if (x == double(i))
      {
        for (int j = 0; j < 6; j++)
          w[j] = (j == i) ? 1.0 : 0.0;
        return;
      }
    }
    double den = 0.0;
    for (int i = 0; i < 6; i++)
    {
      w[i] = c_uniform[i] / (x - double(i));
      den += w[i];
    }
    for (int i = 0; i < 6; i++)
      w[i] /= den;
    return;
  }
  lagrange_unit_weights(x, ordn, w);
 }
 inline bool fast_interp_map_index(int idx, int extent, int d,
                                  int &mapped, unsigned char &mask)
 {
  if (idx > 0)
    mapped = idx;
  else
  {
    mask |= (unsigned char)(1u << d);
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
    mapped = 2 - idx;
 #else
 #ifdef Cell
    mapped = 1 - idx;
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
  }
  return mapped >= 1 && mapped <= extent;
 }
 bool prepare_fast_interp_stencil(Block *BP, const double *pox, int ordn,
                                 int Symmetry, FastInterpStencil &st)
 {
  if (!BP || ordn <= 0 || ordn > 8)
    return false;
  st.nsamples = 0;
  const int NO_SYMM = 0;
  const int OCTANT = 2;
  int cmin[dim], cmax[dim], cxT[dim];
  for (int d = 0; d < dim; d++)
  {
    const double *X = BP->X[d];
    const double dX = X[1] - X[0];
    const int cxI = fortran_idint_local((pox[d] - X[0]) / dX + 0.4) + 1;
    st.cxB[d] = cxI - ordn / 2 + 1;
    cxT[d] = st.cxB[d] + ordn - 1;
    cmin[d] = 1;
    cmax[d] = BP->shape[d];
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
    if (Symmetry == OCTANT && d < 2 && fabs(X[0]) < dX)
      cmin[d] = -ordn / 2 + 2;
    if (Symmetry != NO_SYMM && d == 2 && fabs(X[0]) < dX)
      cmin[d] = -ordn / 2 + 2;
 #else
 #ifdef Cell
    if (Symmetry == OCTANT && d < 2 && fabs(X[0]) < dX)
      cmin[d] = -ordn / 2 + 1;
    if (Symmetry != NO_SYMM && d == 2 && fabs(X[0]) < dX)
      cmin[d] = -ordn / 2 + 1;
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
    if (st.cxB[d] < cmin[d])
    {
      st.cxB[d] = cmin[d];
      cxT[d] = st.cxB[d] + ordn - 1;
    }
    if (cxT[d] > cmax[d])
    {
      cxT[d] = cmax[d];
      st.cxB[d] = cxT[d] + 1 - ordn;
    }
    if (st.cxB[d] > 0)
      st.cx[d] = (pox[d] - X[st.cxB[d] - 1]) / dX;
    else
    {
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
      st.cx[d] = (pox[d] + X[1 - st.cxB[d]]) / dX;
 #else
 #ifdef Cell
      st.cx[d] = (pox[d] + X[-st.cxB[d]]) / dX;
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
    }
  }
  lagrange_unit_weights(st.cx[0], ordn, st.wx);
  lagrange_unit_weights(st.cx[1], ordn, st.wy);
  z_unit_weights(st.cx[2], ordn, st.wz);
  for (int kk = 0; kk < ordn; kk++)
  {
    for (int jj = 0; jj < ordn; jj++)
    {
      for (int ii = 0; ii < ordn; ii++)
      {
        unsigned char mask = 0;
        int ix, iy, iz;
        if (!fast_interp_map_index(st.cxB[0] + ii, BP->shape[0], 0, ix, mask) ||
            !fast_interp_map_index(st.cxB[1] + jj, BP->shape[1], 1, iy, mask) ||
            !fast_interp_map_index(st.cxB[2] + kk, BP->shape[2], 2, iz, mask))
          return false;
        const int s = st.nsamples++;
        st.loc[s] = (ix - 1) + (iy - 1) * BP->shape[0] +
                    (iz - 1) * BP->shape[0] * BP->shape[1];
        st.sign_mask[s] = mask;
        st.weight[s] = st.wx[ii] * st.wy[jj] * st.wz[kk];
      }
    }
  }
  return true;
 }
 bool interpolate_var_list_with_stencil(Block *BP, MyList<var> *VarList,
                                       int num_var, const double *pox,
                                       int ordn, int Symmetry,
                                       const FastInterpStencil &st,
                                       double *out)
 {
  if (num_var <= 0 || num_var > 128)
    return false;
  double *data_ptrs[128];
  double *soa_ptrs[128];
  var *vars[128];
  MyList<var> *varl = VarList;
  int k = 0;
  while (varl)
  {
    if (k >= num_var)
      return false;
    vars[k] = varl->data;
    data_ptrs[k] = BP->fgfs[vars[k]->sgfn];
    soa_ptrs[k] = vars[k]->SoA;
    out[k] = 0.0;
    varl = varl->next;
    k++;
  }
  if (k != num_var)
    return false;
  for (int s = 0; s < st.nsamples; s++)
  {
    const int loc = st.loc[s];
    const double w = st.weight[s];
    const unsigned char mask = st.sign_mask[s];
    if (mask == 0)
    {
      for (int v = 0; v < num_var; v++)
        out[v] += w * data_ptrs[v][loc];
    }
    else
    {
      for (int v = 0; v < num_var; v++)
      {
        const double *SoA = soa_ptrs[v];
        double sgn = 1.0;
        if (mask & 1u)
          sgn *= SoA[0];
        if (mask & 2u)
          sgn *= SoA[1];
        if (mask & 4u)
          sgn *= SoA[2];
        out[v] += w * sgn * data_ptrs[v][loc];
      }
    }
  }
  if (interp_fast_compare_enabled())
  {
    static int report_count = 0;
    static long long compare_calls = 0;
    if (compare_calls++ >= interp_fast_compare_limit())
      return true;
    const double tol = interp_fast_compare_tol();
    varl = VarList;
    k = 0;
    while (varl)
    {
      var *vp = vars[k];
      double ref = 0.0;
      double x = pox[0], y = pox[1], z = pox[2];
      f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2],
                      BP->fgfs[vp->sgfn], ref,
                      x, y, z, ordn, vp->SoA, Symmetry);
      const double diff = fabs(ref - out[k]);
      const double scale = 1.0 + fabs(ref);
      if (diff > tol * scale && report_count < 32)
      {
        int rank = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        fprintf(stderr,
                "[AMSS-INTERP-CMP][rank %d] var=%s diff=%.17e ref=%.17e fast=%.17e p=(%.17e,%.17e,%.17e)\n",
                rank, vp->name, diff, ref, out[k], pox[0], pox[1], pox[2]);
        report_count++;
      }
      varl = varl->next;
      k++;
    }
  }
  return true;
 }
 bool interpolate_var_list_fast(Block *BP, MyList<var> *VarList, int num_var,
                               const double *pox, int ordn, int Symmetry,
                               double *out)
 {
  if (!interp_fast_enabled())
    return false;
  FastInterpStencil st;
  if (!prepare_fast_interp_stencil(BP, pox, ordn, Symmetry, st))
    return false;
  return interpolate_var_list_with_stencil(BP, VarList, num_var, pox,
                                           ordn, Symmetry, st, out);
 }
 struct CachedInterpPoint
 {
  Block *bp;
  int owner_rank;
  FastInterpStencil stencil;
 };
 struct SurfaceInterpCache
 {
  Patch *patch;
  int NN;
  int symmetry;
  double key[9];
  vector<CachedInterpPoint> points;
  SurfaceInterpCache() : patch(0), NN(0), symmetry(-1) {}
 };
 bool surface_cache_key_matches(const SurfaceInterpCache &cache, Patch *patch,
                               int NN, double **XX, int Symmetry)
 {
  if (cache.patch != patch || cache.NN != NN || cache.symmetry != Symmetry ||
      int(cache.points.size()) != NN || NN <= 0)
    return false;
  const int mid = NN / 2;
  const int last = NN - 1;
  const int ids[3] = {0, mid, last};
  int p = 0;
  for (int q = 0; q < 3; q++)
    for (int d = 0; d < dim; d++)
      if (cache.key[p++] != XX[d][ids[q]])
        return false;
  return true;
 }
 SurfaceInterpCache *find_surface_cache(Patch *patch, int NN, double **XX,
                                       int Symmetry)
 {
  static vector<SurfaceInterpCache> caches;
  for (size_t i = 0; i < caches.size(); i++)
    if (surface_cache_key_matches(caches[i], patch, NN, XX, Symmetry))
      return &caches[i];
  if (caches.size() >= 24)
    caches.erase(caches.begin());
  caches.push_back(SurfaceInterpCache());
  return &caches.back();
 }
 bool build_surface_cache(SurfaceInterpCache &cache, Patch *patch, int NN,
                         double **XX, int Symmetry, const double *DH,
                         const BlockBinIndex &block_index, int ordn)
 {
  int myrank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  cache.patch = patch;
  cache.NN = NN;
  cache.symmetry = Symmetry;
  cache.points.clear();
  cache.points.resize(NN);
  const int mid = NN / 2;
  const int last = NN - 1;
  const int ids[3] = {0, mid, last};
  int p = 0;
  for (int q = 0; q < 3; q++)
    for (int d = 0; d < dim; d++)
      cache.key[p++] = XX[d][ids[q]];
  for (int j = 0; j < NN; j++)
  {
    double pox[dim];
    for (int d = 0; d < dim; d++)
      pox[d] = XX[d][j];
    const int block_i = find_block_index_for_point(block_index, pox, DH);
    if (block_i < 0)
    {
      cache.points[j].bp = 0;
      cache.points[j].owner_rank = -1;
      continue;
    }
    Block *BP = block_index.views[block_i].bp;
    cache.points[j].bp = BP;
    cache.points[j].owner_rank = BP->rank;
    cache.points[j].stencil.nsamples = 0;
    if (BP->rank == myrank)
    {
      if (!prepare_fast_interp_stencil(BP, pox, ordn, Symmetry,
                                       cache.points[j].stencil))
        return false;
    }
  }
  return true;
 }
 } // namespace
 Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
@@ -1006,18 +565,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
      if (myrank == BP->rank)
      {
        //---> interpolation
-        if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
+        varl = VarList;
-                                       Symmetry, Shellf + j * num_var))
+        int k = 0;
        while (varl) // run along variables
        {
-          varl = VarList;
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-          int k = 0;
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
-          while (varl) // run along variables
+          varl = varl->next;
-          {
+          k++;
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
    }
@@ -1104,6 +659,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
    varl = varl->next;
  }
  memset(Shellf, 0, sizeof(double) * NN * num_var);
  // owner_rank[j] records which MPI rank owns point j
  int *owner_rank;
  owner_rank = new int[NN];
@@ -1115,113 +672,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
    DH[i] = getdX(i);
  BlockBinIndex block_index;
  build_block_bin_index(this, DH, block_index);
  SurfaceInterpCache *surface_cache = 0;
  bool use_surface_cache = false;
  if (interp_fast_enabled())
  {
    surface_cache = find_surface_cache(this, NN, XX, Symmetry);
    use_surface_cache = surface_cache_key_matches(*surface_cache, this, NN, XX, Symmetry);
    if (!use_surface_cache)
      use_surface_cache = build_surface_cache(*surface_cache, this, NN, XX,
                                              Symmetry, DH, block_index, ordn);
  }
  // --- Interpolation phase (identical to original) ---
 #if USE_CUDA_BSSN
  const bool use_gpu_interp = interp_gpu_enabled() && use_surface_cache && num_var == 2 &&
                              VarList && VarList->next && !VarList->next->next;
 #else
  const bool use_gpu_interp = false;
 #endif
  if (use_gpu_interp)
  {
 #if USE_CUDA_BSSN
    vector<vector<int> > local_points(block_index.views.size());
    for (int j = 0; j < NN; j++)
    {
      for (int i = 0; i < dim; i++)
      {
        if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
        {
          cout << "Patch::Interp_Points: point (";
          for (int k = 0; k < dim; k++)
          {
            cout << XX[k][j];
            if (k < dim - 1)
              cout << ",";
            else
              cout << ") is out of current Patch." << endl;
          }
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      CachedInterpPoint &cp = surface_cache->points[j];
      Block *BP = cp.bp;
      owner_rank[j] = cp.owner_rank;
      if (BP && myrank == BP->rank)
      {
        for (size_t bi = 0; bi < block_index.views.size(); bi++)
        {
          if (block_index.views[bi].bp == BP)
          {
            local_points[bi].push_back(j);
            break;
          }
        }
      }
    }
    var *v0 = VarList->data;
    var *v1 = VarList->next->data;
    double soa6[6] = {
        v0->SoA[0], v0->SoA[1], v0->SoA[2],
        v1->SoA[0], v1->SoA[1], v1->SoA[2]};
    for (size_t bi = 0; bi < local_points.size(); bi++)
    {
      const int count = int(local_points[bi].size());
      if (count <= 0)
        continue;
      Block *BP = block_index.views[bi].bp;
      vector<double> px(count), py(count), pz(count), out(2 * count);
      for (int q = 0; q < count; q++)
      {
        const int j = local_points[bi][q];
        px[q] = XX[0][j];
        py[q] = XX[1][j];
        pz[q] = XX[2][j];
      }
      const double dx = BP->X[0][1] - BP->X[0][0];
      const double dy = BP->X[1][1] - BP->X[1][0];
      const double dz = BP->X[2][1] - BP->X[2][0];
      const int ok = bssn_cuda_interp_host_two_fields(
          BP, BP->shape,
          BP->fgfs[v0->sgfn], BP->fgfs[v1->sgfn],
          BP->X[0][0], BP->X[1][0], BP->X[2][0],
          dx, dy, dz,
          &px[0], &py[0], &pz[0], count,
          ordn, Symmetry, soa6, &out[0]);
      if (ok != 0)
      {
        if (myrank == 0)
          cout << "Patch::Interp_Points: CUDA two-field interpolation failed" << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      for (int q = 0; q < count; q++)
      {
        const int j = local_points[bi][q];
        Shellf[j * num_var] = out[2 * q];
        Shellf[j * num_var + 1] = out[2 * q + 1];
      }
    }
 #endif
  }
  else
  {
  for (int j = 0; j < NN; j++)
  {
    double pox[dim];
@@ -1243,56 +695,25 @@ void Patch::Interp_Points(MyList<var> *VarList,
      }
    }
-    if (use_surface_cache)
+    const int block_i = find_block_index_for_point(block_index, pox, DH);
    if (block_i >= 0)
    {
-      CachedInterpPoint &cp = surface_cache->points[j];
+      Block *BP = block_index.views[block_i].bp;
-      Block *BP = cp.bp;
+      owner_rank[j] = BP->rank;
      owner_rank[j] = cp.owner_rank;
      if (BP && myrank == BP->rank)
      {
        if (!interpolate_var_list_with_stencil(BP, VarList, num_var, pox,
                                               ordn, Symmetry, cp.stencil,
                                               Shellf + j * num_var))
        {
          MyList<var> *varl_fallback = VarList;
          int k = 0;
          while (varl_fallback)
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl_fallback->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl_fallback->data->SoA, Symmetry);
            varl_fallback = varl_fallback->next;
            k++;
          }
        }
      }
    }
    else
    {
      const int block_i = find_block_index_for_point(block_index, pox, DH);
      if (block_i >= 0)
      {
        Block *BP = block_index.views[block_i].bp;
        owner_rank[j] = BP->rank;
      if (myrank == BP->rank)
      {
-        if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
+        varl = VarList;
-                                       Symmetry, Shellf + j * num_var))
+        int k = 0;
        while (varl)
        {
-          MyList<var> *varl_fallback = VarList;
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-          int k = 0;
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
-          while (varl_fallback)
+          varl = varl->next;
-          {
+          k++;
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl_fallback->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl_fallback->data->SoA, Symmetry);
            varl_fallback = varl_fallback->next;
            k++;
          }
        }
      }
      }
    }
  }
  }
 #ifdef INTERP_LB_PROFILE
  double t_interp_end = MPI_Wtime();
@@ -1548,18 +969,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
      if (myrank == BP->rank)
      {
        //---> interpolation
-        if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
+        varl = VarList;
-                                       Symmetry, Shellf + j * num_var))
+        int k = 0;
        while (varl) // run along variables
        {
-          varl = VarList;
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-          int k = 0;
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
-          while (varl) // run along variables
+          varl = varl->next;
-          {
+          k++;
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
    }
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -106,12 +106,6 @@ namespace Parallel
    int *recv_buf_caps;
    unsigned char *send_buf_pinned;
    unsigned char *recv_buf_pinned;
    unsigned char *send_buf_is_dev;
    unsigned char *recv_buf_is_dev;
    int *send_buf_caps_dev;
    int *recv_buf_caps_dev;
    double **send_bufs_dev;
    double **recv_bufs_dev;
    MPI_Request *reqs;
    MPI_Status *stats;
    int max_reqs;
@@ -119,7 +113,6 @@ namespace Parallel
    int *tc_req_node;
    int *tc_req_is_recv;
    int *tc_completed;
    bool cuda_aware_mode;
    SyncCache();
    void invalidate();
    void destroy();
--- a/AMSS_NCKU_source/ShellPatch.C
+++ b/AMSS_NCKU_source/ShellPatch.C
--- a/AMSS_NCKU_source/ShellPatch.h
+++ b/AMSS_NCKU_source/ShellPatch.h
@@ -102,16 +102,6 @@ public:
                 //-1: means no dumy dimension at all; 0: means rho; 1: means sigma
   };
   // Thread-safe search result (no pointers to shared mutable state)
   struct PointSearchResult
   {
      bool found;
      Block *Bg;
      double gx, gy, gz; // global Cartesian coordinates
      double lx, ly, lz; // local coordinates within the found block
      int ssst;          // source shell-patch type (-1 = Cartesian)
   };
   int myrank;
   int shape[dim];   // for (rho, sigma, R), for rho and sigma means number of points for every pi/2
   double Rrange[2]; // for Rmin and Rmax
@@ -185,12 +175,6 @@ public:
                         MyList<Patch> *Pp, double CDH[dim], MyList<pointstru> *pss);
   bool prolongpointstru(MyList<pointstru> *&psul, bool ssyn, int tsst, MyList<ss_patch> *sPp, double DH[dim],
                         MyList<Patch> *Pp, double CDH[dim], double x, double y, double z, int Symmetry, int rank_in);
   // Read-only point search — thread-safe (no shared mutable state modified)
   PointSearchResult prolongpointstru_search(bool ssyn, int tsst, MyList<ss_patch> *sPp, double DH[dim],
                                             MyList<Patch> *Pp, double CDH[dim], double x, double y, double z,
                                             int Symmetry, int rank_in);
   // Append a search result to a linked list — use inside omp critical section
   void prolongpointstru_append(MyList<pointstru> *&psul, const PointSearchResult &sr, int tsst);
   void setupintintstuff(int cpusize, MyList<Patch> *CPatL, int Symmetry);
   void intertransfer(MyList<pointstru> **src, MyList<pointstru> **dst,
                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -3,7 +3,6 @@
 #include <sstream>
 #include <cstdio>
 #include <map>
 #include <string>
 using namespace std;
 #else
 #include <stdio.h>
@@ -36,12 +35,6 @@ using namespace std;
 #if USE_CUDA_Z4C && (ABEtype == 2)
 #include "z4c_rhs_cuda.h"
 #endif
 #if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
 #ifdef WithShell
 #include "bssn_gpu.h"
 #endif
 #endif
 #ifdef With_AHF
 #include "derivatives.h"
@@ -52,81 +45,6 @@ using namespace std;
 // Define Z4c_class
 #if USE_CUDA_Z4C && (ABEtype == 2) && defined(WithShell)
 // GPU-accelerated Z4C shell RHS: same parameter signature as f_compute_rhs_Z4c_ss.
 // Internally calls gpu_rhs_z4c_ss which modifies trK→trKd before upload,
 // runs BSSN algebraic kernels, then applies Z4C post-processing (TZ_rhs, damping).
 extern "C" {
 static int cuda_compute_rhs_z4c_ss(
    int *ex, double &T, double *crho, double *sigma, double *R,
    double *X, double *Y, double *Z,
    double *drhodx, double *drhody, double *drhodz,
    double *dsigmadx, double *dsigmady, double *dsigmadz,
    double *dRdx, double *dRdy, double *dRdz,
    double *drhodxx, double *drhodxy, double *drhodxz, double *drhodyy, double *drhodyz, double *drhodzz,
    double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, double *dsigmadyy, double *dsigmadyz, double *dsigmadzz,
    double *dRdxx, double *dRdxy, double *dRdxz, double *dRdyy, double *dRdyz, double *dRdzz,
    double *chi, double *trK,
    double *gxx, double *gxy, double *gxz, double *gyy, double *gyz, double *gzz,
    double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
    double *Gamx, double *Gamy, double *Gamz,
    double *Lap, double *betax, double *betay, double *betaz,
    double *dtSfx, double *dtSfy, double *dtSfz,
    double *TZ,
    double *chi_rhs, double *trK_rhs,
    double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
    double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
    double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
    double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
    double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
    double *TZ_rhs,
    double *rho_mat, double *Sx, double *Sy, double *Sz,
    double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
    double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
    double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
    double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
    double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
    double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
    double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
    int &Symmetry, int &Lev, double &eps, int &sst, int &co)
 {
    return gpu_rhs_z4c_ss(0, 0,  // calledby=ABE_main, mpi_rank=device_0
                     ex, T, crho, sigma, R, X, Y, Z,
                     drhodx, drhody, drhodz,
                     dsigmadx, dsigmady, dsigmadz,
                     dRdx, dRdy, dRdz,
                     drhodxx, drhodxy, drhodxz, drhodyy, drhodyz, drhodzz,
                     dsigmadxx, dsigmadxy, dsigmadxz, dsigmadyy, dsigmadyz, dsigmadzz,
                     dRdxx, dRdxy, dRdxz, dRdyy, dRdyz, dRdzz,
                     chi, trK,
                     gxx, gxy, gxz, gyy, gyz, gzz,
                     Axx, Axy, Axz, Ayy, Ayz, Azz,
                     Gamx, Gamy, Gamz,
                     Lap, betax, betay, betaz,
                     dtSfx, dtSfy, dtSfz,
                     TZ,
                     chi_rhs, trK_rhs,
                     gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
                     Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
                     Gamx_rhs, Gamy_rhs, Gamz_rhs,
                     Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
                     dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
                     TZ_rhs,
                     rho_mat, Sx, Sy, Sz,
                     Sxx, Sxy, Sxz, Syy, Syz, Szz,
                     Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                     Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                     Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                     Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                     ham_Res, movx_Res, movy_Res, movz_Res,
                     Gmx_Res, Gmy_Res, Gmz_Res,
                     Symmetry, Lev, eps, sst, co);
 }
 }
 // Redirect all Z4C shell RHS calls in Step/SHStep to GPU
 #define f_compute_rhs_Z4c_ss cuda_compute_rhs_z4c_ss
 #endif
 // This class inherits some members and methods from the parent `bssn_class` and modifies others.
 // The modified members and methods are defined below (and in the header Z4c_class.h).
 // The remaining members/methods are inherited from `bssn_class` (declared in bssn_class.h).
@@ -222,13 +140,6 @@ void Z4c_class::Initialize()
    PhysTime = StartTime;
    Setup_Black_Hole_position();
  }
  sync_cache_pre = new Parallel::SyncCache[GH->levels];
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
 }
 //================================================================================================
@@ -268,6 +179,9 @@ Z4c_class::~Z4c_class()
 // for sommerfeld boundary
 #if USE_CUDA_Z4C && (ABEtype == 2)
 #ifdef WithShell
 #error "USE_CUDA_Z4C resident path currently supports Cartesian non-shell Z4C only"
 #endif
 #if (MRBD == 2)
 #error "USE_CUDA_Z4C resident path does not support MRBD == 2"
 #endif
@@ -538,17 +452,6 @@ bool z4c_cuda_compute_porg_rhs_resident(cgh *GH,
  return true;
 }
 bool z4c_cuda_resident_step_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_Z4C_CUDA_RESIDENT");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 } // namespace
 #endif
@@ -568,14 +471,6 @@ void Z4c_class::Step(int lev, int YN)
  int pre = 0, cor = 1;
  int ERROR = 0;
 #ifdef WithShell
  if (bssn_cuda_use_resident_sync(lev))
  {
    for (int dl = 0; dl < GH->levels; dl++)
      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
  }
 #endif
  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
  {
@@ -603,7 +498,7 @@ void Z4c_class::Step(int lev, int YN)
 #elif (MRBD == 1)
        apply_bam_bc = 1;
 #endif
-        int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0;
+        int keep_resident_state = 1;
        int apply_enforce_ga = 0;
 #if (AGM == 0)
        apply_enforce_ga = 1;
@@ -642,7 +537,7 @@ void Z4c_class::Step(int lev, int YN)
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
-  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  if (BH_num > 0 && lev == GH->levels - 1)
  {
@@ -698,7 +593,7 @@ void Z4c_class::Step(int lev, int YN)
 #elif (MRBD == 1)
          apply_bam_bc = 1;
 #endif
-          int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0;
+          int keep_resident_state = 1;
          int apply_enforce_ga = 0;
 #if (AGM == 0)
          apply_enforce_ga = 1;
@@ -740,25 +635,18 @@ void Z4c_class::Step(int lev, int YN)
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
-    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    if (BH_num > 0 && lev == GH->levels - 1)
    {
-      if (z4c_cuda_resident_step_enabled())
+      if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num,
                                              Porg, Porg1,
                                              Sfx, Sfy, Sfz, Symmetry))
      {
-        if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num,
+        if (myrank == 0 && ErrorMonitor->outfile)
-                                                Porg, Porg1,
+          ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = "
-                                                Sfx, Sfy, Sfz, Symmetry))
+                                << PhysTime << endl;
-        {
+        MPI_Abort(MPI_COMM_WORLD, 1);
          if (myrank == 0 && ErrorMonitor->outfile)
            ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = "
                                  << PhysTime << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      else
      {
        compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
      }
      for (int ithBH = 0; ithBH < BH_num; ithBH++)
      {
@@ -803,7 +691,7 @@ void Z4c_class::Step(int lev, int YN)
    }
  }
-  z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, false);
+  z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
 #if (RPS == 0)
  RestrictProlong(lev, YN, BB);
@@ -1001,13 +889,6 @@ void Z4c_class::Step(int lev, int YN)
  }
 #ifdef WithShell
 #if USE_CUDA_Z4C
  if (bssn_cuda_use_resident_sync(lev))
  {
    for (int dl = 0; dl < GH->levels; dl++)
      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
  }
 #endif
  // evolve Shell Patches
  if (lev == 0)
  {
@@ -1715,7 +1596,9 @@ void Z4c_class::Step(int lev, int YN)
 }
 #else
 // for constraint preserving boundary (CPBC)
-// Note: CPBC path uses CPU Fortran RHS; GPU resident sync is a no-op here.
+#if USE_CUDA_Z4C && (ABEtype == 2)
 #error "USE_CUDA_Z4C resident path does not support CPBC"
 #endif
 #ifndef WithShell
 #error "CPBC only supports Shell"
 #endif
@@ -1745,14 +1628,6 @@ void Z4c_class::Step(int lev, int YN)
  int pre = 0, cor = 1;
  int ERROR = 0;
 #if USE_CUDA_Z4C && defined(WithShell)
  if (bssn_cuda_use_resident_sync(lev))
  {
    for (int dl = 0; dl < GH->levels; dl++)
      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
  }
 #endif
  MyList<ss_patch> *sPp;
  // Predictor
  MyList<Patch> *Pp = GH->PatL[lev];
@@ -3083,11 +2958,6 @@ void Z4c_class::Check_extrop()
 //================================================================================================
 #if USE_CUDA_Z4C && (ABEtype == 2) && defined(WithShell)
 #undef f_compute_rhs_Z4c_ss
 #define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_
 #endif
 // this member function is used to compute and output constraint violation
 //================================================================================================
@@ -3363,12 +3233,11 @@ void Z4c_class::Interp_Constraint()
  }
  ofstream outfile;
-  char suffix[64];
+  char filename[50];
-  sprintf(suffix, "/interp_constraint_%05d.dat", int(PhysTime / dT + 0.5));
+  sprintf(filename, "%s/interp_constraint_%05d.dat", ErrorMonitor->out_dir.c_str(), int(PhysTime / dT + 0.5)); 
  string filename = ErrorMonitor->out_dir + suffix;
  // 0.5 for round off
-  outfile.open(filename.c_str());
+  outfile.open(filename);
  outfile << "#  corrdinate, H_Res, Px_Res, Py_Res, Pz_Res, Gx_Res, Gy_Res, Gz_Res, ...." << endl;
  for (int i = 0; i < n; i++)
  {
--- a/AMSS_NCKU_source/bssnEM_class.C
+++ b/AMSS_NCKU_source/bssnEM_class.C
@@ -2,9 +2,7 @@
 #ifdef newc
 #include <sstream>
 #include <cstdio>
 #include <cstdlib>
 #include <map>
 #include <string>
 using namespace std;
 #else
 #include <stdio.h>
@@ -12,7 +10,6 @@ using namespace std;
 #endif
 #include <time.h>
 #include <cstring>
 #include "macrodef.h"
 #include "misc.h"
@@ -22,9 +19,6 @@ using namespace std;
 #include "bssnEM_class.h"
 #include "bssn_rhs.h"
 #include "empart.h"
 #if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
 #endif
 #include "initial_puncture.h"
 #include "initial_maxwell.h"
 #include "enforce_algebra.h"
@@ -42,387 +36,6 @@ using namespace std;
 //================================================================================================
 namespace
 {
 MyList<var> *advance_var_list(MyList<var> *vars, int count)
 {
  while (vars && count > 0)
  {
    vars = vars->next;
    --count;
  }
  return vars;
 }
 bool bssn_em_step_timing_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_EM_STEP_TIMING");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool bssn_em_step_timing_all_levels_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_EM_STEP_TIMING_ALL_LEVELS");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 #if USE_CUDA_BSSN
 bool bssn_em_zero_analysis_fastpath_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_EM_ZERO_ANALYSIS_FASTPATH");
    enabled = (!env || atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool bssn_em_zero_resident_download_fastpath_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH");
    enabled = (!env || atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool bssn_em_resident_zero_fastpath_ready(MyList<Patch> *PatL,
 #ifdef WithShell
                                          ShellPatch *shell,
 #else
                                          ShellPatch * /*shell*/,
 #endif
                                          int rank)
 {
  int local_ok = 1;
  int local_seen = 0;
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (rank == cg->rank)
      {
        local_seen = 1;
        if (!bssn_em_cuda_resident_zero_fast_state(cg))
          local_ok = 0;
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 #ifdef WithShell
  if (shell && shell->PatL)
  {
    MyList<ss_patch> *SP = shell->PatL;
    while (SP)
    {
      MyList<Block> *BP = SP->data->blb;
      while (BP)
      {
        Block *cg = BP->data;
        if (rank == cg->rank)
        {
          local_seen = 1;
          if (!bssn_em_cuda_resident_zero_fast_state(cg))
            local_ok = 0;
        }
        if (BP == SP->data->ble)
          break;
        BP = BP->next;
      }
      SP = SP->next;
    }
  }
 #endif
  int global_ok = 0;
  int global_seen = 0;
  MPI_Allreduce(&local_ok, &global_ok, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(&local_seen, &global_seen, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
  return global_seen && global_ok;
 }
 bool bssn_em_analysis_zero_fastpath_ready(MyList<Patch> *PatL,
 #ifdef WithShell
                                          ShellPatch *shell,
 #else
                                          ShellPatch *shell,
 #endif
                                          int rank)
 {
  if (!bssn_em_zero_analysis_fastpath_enabled())
    return false;
  return bssn_em_resident_zero_fastpath_ready(PatL, shell, rank);
 }
 void zero_em_analysis_outputs(MyList<Patch> *PatL,
 #ifdef WithShell
                              ShellPatch *shell,
 #else
                              ShellPatch * /*shell*/,
 #endif
                              int rank,
                              var *Rphi2_var, var *Iphi2_var,
                              var *Rphi1_var, var *Iphi1_var)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (rank == cg->rank)
      {
        const size_t all = (size_t)cg->shape[0] * cg->shape[1] * cg->shape[2];
        std::memset(cg->fgfs[Rphi2_var->sgfn], 0, all * sizeof(double));
        std::memset(cg->fgfs[Iphi2_var->sgfn], 0, all * sizeof(double));
        std::memset(cg->fgfs[Rphi1_var->sgfn], 0, all * sizeof(double));
        std::memset(cg->fgfs[Iphi1_var->sgfn], 0, all * sizeof(double));
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 #ifdef WithShell
  if (shell && shell->PatL)
  {
    MyList<ss_patch> *SP = shell->PatL;
    while (SP)
    {
      MyList<Block> *BP = SP->data->blb;
      while (BP)
      {
        Block *cg = BP->data;
        if (rank == cg->rank)
        {
          const size_t all = (size_t)cg->shape[0] * cg->shape[1] * cg->shape[2];
          std::memset(cg->fgfs[Rphi2_var->sgfn], 0, all * sizeof(double));
          std::memset(cg->fgfs[Iphi2_var->sgfn], 0, all * sizeof(double));
          std::memset(cg->fgfs[Rphi1_var->sgfn], 0, all * sizeof(double));
          std::memset(cg->fgfs[Iphi1_var->sgfn], 0, all * sizeof(double));
        }
        if (BP == SP->data->ble)
          break;
        BP = BP->next;
      }
      SP = SP->next;
    }
  }
 #endif
 }
 #endif
 int bssn_em_step_timing_every()
 {
  static int every = -1;
  if (every < 0)
  {
    const char *env = getenv("AMSS_EM_STEP_TIMING_EVERY");
    every = (env && atoi(env) > 0) ? atoi(env) : 1;
  }
  return every;
 }
 #if USE_CUDA_BSSN
 bool fill_bssn_em_bssn_cuda_views(Block *cg, MyList<var> *vars,
                                  double **host_views,
                                  double *propspeeds = 0,
                                  double *soa_flat = 0)
 {
  int idx = 0;
  while (vars && idx < BSSN_CUDA_STATE_COUNT)
  {
    host_views[idx] = cg->fgfs[vars->data->sgfn];
    if (propspeeds)
      propspeeds[idx] = vars->data->propspeed;
    if (soa_flat)
    {
      soa_flat[3 * idx + 0] = vars->data->SoA[0];
      soa_flat[3 * idx + 1] = vars->data->SoA[1];
      soa_flat[3 * idx + 2] = vars->data->SoA[2];
    }
    vars = vars->next;
    ++idx;
  }
  return idx == BSSN_CUDA_STATE_COUNT;
 }
 bool fill_bssn_em_cuda_views(Block *cg, MyList<var> *vars,
                             double **host_views,
                             double *propspeeds = 0,
                             double *soa_flat = 0)
 {
  int idx = 0;
  while (vars && idx < BSSN_EM_CUDA_STATE_COUNT)
  {
    host_views[idx] = cg->fgfs[vars->data->sgfn];
    if (propspeeds)
      propspeeds[idx] = vars->data->propspeed;
    if (soa_flat)
    {
      soa_flat[3 * idx + 0] = vars->data->SoA[0];
      soa_flat[3 * idx + 1] = vars->data->SoA[1];
      soa_flat[3 * idx + 2] = vars->data->SoA[2];
    }
    vars = vars->next;
    ++idx;
  }
  return idx == BSSN_EM_CUDA_STATE_COUNT && vars == 0;
 }
 void fill_bssn_em_fixed_source_cuda_views(Block *cg, double **sources,
                                          var *Jx, var *Jy, var *Jz, var *qchar)
 {
  sources[0] = cg->fgfs[Jx->sgfn];
  sources[1] = cg->fgfs[Jy->sgfn];
  sources[2] = cg->fgfs[Jz->sgfn];
  sources[3] = cg->fgfs[qchar->sgfn];
 }
 void fill_bssn_em_matter_cuda_views(Block *cg, double **matter,
                                    var *rho, var *Sx, var *Sy, var *Sz,
                                    var *Sxx, var *Sxy, var *Sxz,
                                    var *Syy, var *Syz, var *Szz)
 {
  matter[0] = cg->fgfs[rho->sgfn];
  matter[1] = cg->fgfs[Sx->sgfn];
  matter[2] = cg->fgfs[Sy->sgfn];
  matter[3] = cg->fgfs[Sz->sgfn];
  matter[4] = cg->fgfs[Sxx->sgfn];
  matter[5] = cg->fgfs[Sxy->sgfn];
  matter[6] = cg->fgfs[Sxz->sgfn];
  matter[7] = cg->fgfs[Syy->sgfn];
  matter[8] = cg->fgfs[Syz->sgfn];
  matter[9] = cg->fgfs[Szz->sgfn];
 }
 bool bssn_em_cuda_use_resident_sync(int lev)
 {
 #ifdef WithShell
  (void)lev;
  return false;
 #else
  return true;
 #endif
 }
 bool bssn_em_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
 {
  static int keep_all_levels = -1;
  if (keep_all_levels < 0)
  {
    const char *env = getenv("AMSS_CUDA_EM_KEEP_ALL_LEVELS");
    keep_all_levels = (env && atoi(env) != 0) ? 1 : 0;
  }
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_EM_KEEP_RESIDENT_AFTER_STEP");
    if (!env)
      env = getenv("AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  if (!enabled)
    return false;
  if (lev == analysis_lev)
    return false;
  if (keep_all_levels)
    return true;
  return lev < trfls_in;
 }
 void bssn_em_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                       int myrank, bool release_ctx)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
      {
        double *state_out[BSSN_EM_CUDA_STATE_COUNT];
        if (!fill_bssn_em_cuda_views(cg, vars, state_out))
        {
          cout << "CUDA BSSN-EM resident state list mismatch during download" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_cuda_download_resident_state_count_if_present(cg, cg->shape,
                                                                state_out,
                                                                BSSN_EM_CUDA_STATE_COUNT))
        {
          cout << "CUDA BSSN-EM resident state download failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (release_ctx)
          bssn_cuda_release_step_ctx(cg);
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 }
 void bssn_em_cuda_keep_only_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                        int myrank)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
      {
        double *state_key[BSSN_EM_CUDA_STATE_COUNT];
        if (!fill_bssn_em_cuda_views(cg, vars, state_key))
        {
          cout << "CUDA BSSN-EM resident state list mismatch during prune" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_cuda_keep_only_resident_state_count(cg, cg->shape,
                                                     state_key,
                                                     BSSN_EM_CUDA_STATE_COUNT))
        {
          cout << "CUDA BSSN-EM keep-only resident state failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 }
 #endif
 }
 // Define bssnEM_class
 // It inherits some members and methods from the parent class bssn_class and modifies others.
@@ -645,13 +258,6 @@ void bssnEM_class::Initialize()
    PhysTime = StartTime;
    Setup_Black_Hole_position();
  }
  sync_cache_pre = new Parallel::SyncCache[GH->levels];
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
 }
 //================================================================================================
@@ -1227,25 +833,9 @@ void bssnEM_class::Step(int lev, int YN)
  int iter_count = 0; // count RK4 substeps
  int pre = 0, cor = 1;
  int ERROR = 0;
 #if USE_CUDA_BSSN
  const bool use_cuda_resident_sync = bssn_em_cuda_use_resident_sync(lev);
 #endif
  const bool em_step_timing = bssn_em_step_timing_enabled();
  const double em_step_t0 = em_step_timing ? MPI_Wtime() : 0.0;
  double em_t0 = 0.0;
  double em_t_predictor = 0.0;
  double em_t_predictor_sync = 0.0;
  double em_t_corrector = 0.0;
  double em_t_corrector_sync = 0.0;
  double em_t_analysis = 0.0;
  double em_t_bh = 0.0;
  double em_t_swap = 0.0;
  double em_t_resident = 0.0;
  double em_t_rp = 0.0;
  MyList<ss_patch> *sPp;
  // Predictor
  em_t0 = em_step_timing ? MPI_Wtime() : 0.0;
  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
  {
@@ -1255,20 +845,15 @@ void bssnEM_class::Step(int lev, int YN)
      Block *cg = BP->data;
      if (myrank == cg->rank)
      {
 #if !USE_CUDA_BSSN
 #if (AGM == 0)
        f_enforce_ga(cg->shape,
                     cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                     cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                     cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
 #endif
-        int em_rhs_error = 0;
+        if (
        bool used_gpu_substep = false;
 #if !USE_CUDA_BSSN
        em_rhs_error =
            f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi0->sgfn],
                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
@@ -1288,52 +873,8 @@ void bssnEM_class::Step(int lev, int YN)
                                 cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
                                 cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
                                 cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
-                                 Symmetry, lev, ndeps);
+                                 Symmetry, lev, ndeps) ||
-#endif
+            f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
 #if USE_CUDA_BSSN
        if (!em_rhs_error)
        {
          double *state_in[BSSN_EM_CUDA_STATE_COUNT];
          double *state_out[BSSN_EM_CUDA_STATE_COUNT];
          double *sources[BSSN_EM_CUDA_SOURCE_COUNT];
          double propspeed[BSSN_EM_CUDA_STATE_COUNT];
          double soa_flat[3 * BSSN_EM_CUDA_STATE_COUNT];
          if (!fill_bssn_em_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
              !fill_bssn_em_cuda_views(cg, SynchList_pre, state_out))
          {
            cout << "CUDA BSSN-EM state list mismatch on predictor step" << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          fill_bssn_em_fixed_source_cuda_views(cg, sources, Jx, Jy, Jz, qchar);
          int apply_bam_bc = 0;
 #if (SommerType == 0)
 #ifndef WithShell
          apply_bam_bc = (lev == 0) ? 1 : 0;
 #endif
 #endif
          int apply_enforce_ga = 0;
 #if (AGM == 0)
          apply_enforce_ga = 1;
 #endif
          int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
          if (bssn_em_cuda_rk4_substep(cg,
                                       cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                       state_in, state_out, sources,
                                       propspeed, soa_flat, Pp->data->bbox,
                                       dT_lev, TRK4, iter_count, apply_bam_bc,
                                       Symmetry, lev, ndeps, pre,
                                       keep_resident_state, apply_enforce_ga, chitiny))
          {
            ERROR = 1;
          }
          used_gpu_substep = true;
        }
 #endif
        if (em_rhs_error ||
            (!used_gpu_substep &&
             f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                               cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                               cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -1366,7 +907,7 @@ void bssnEM_class::Step(int lev, int YN)
                               cg->fgfs[Cons_Ham->sgfn],
                               cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
                               cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                               Symmetry, lev, ndeps, pre)))
+                               Symmetry, lev, ndeps, pre))
        {
          cout << "find NaN in domain: (" 
               << cg->bbox[0] << ":" << cg->bbox[3] << "," 
@@ -1375,8 +916,6 @@ void bssnEM_class::Step(int lev, int YN)
          ERROR = 1;
        }
        if (!used_gpu_substep)
        {
        // rk4 substep and boundary
        {
          MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; 
@@ -1418,7 +957,6 @@ void bssnEM_class::Step(int lev, int YN)
          }
        }
        f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
        }
      }
      if (BP == Pp->data->ble)
        break;
@@ -1426,8 +964,6 @@ void bssnEM_class::Step(int lev, int YN)
    }
    Pp = Pp->next;
  }
  if (em_step_timing)
    em_t_predictor += MPI_Wtime() - em_t0;
  // check error information
  {
    int erh = ERROR;
@@ -1685,11 +1221,7 @@ void bssnEM_class::Step(int lev, int YN)
  }
 #endif
-  if (em_step_timing)
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
    em_t0 = MPI_Wtime();
  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
  if (em_step_timing)
    em_t_predictor_sync += MPI_Wtime() - em_t0;
 #ifdef WithShell
  if (lev == 0)
@@ -1712,8 +1244,6 @@ void bssnEM_class::Step(int lev, int YN)
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    if (em_step_timing)
      em_t0 = MPI_Wtime();
    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
@@ -1742,24 +1272,16 @@ void bssnEM_class::Step(int lev, int YN)
        DG_List->clearList();
      }
    }
    if (em_step_timing)
      em_t_bh += MPI_Wtime() - em_t0;
  }
  // data analysis part
  // Warning NOTE: the variables1 are used as temp storege room
  if (lev == a_lev)
  {
    if (em_step_timing)
      em_t0 = MPI_Wtime();
    AnalysisStuff_EM(lev, dT_lev);
    if (em_step_timing)
      em_t_analysis += MPI_Wtime() - em_t0;
  }
  // corrector
  for (iter_count = 1; iter_count < 4; iter_count++)
  {
    if (em_step_timing)
      em_t0 = MPI_Wtime();
    // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
    if (iter_count == 1 || iter_count == 3)
      TRK4 += dT_lev / 2;
@@ -1772,7 +1294,6 @@ void bssnEM_class::Step(int lev, int YN)
        Block *cg = BP->data;
        if (myrank == cg->rank)
        {
 #if !USE_CUDA_BSSN
 #if (AGM == 0)
          f_enforce_ga(cg->shape,
                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
@@ -1786,13 +1307,9 @@ void bssnEM_class::Step(int lev, int YN)
                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif
 #endif
-          int em_rhs_error = 0;
+          if (
          bool used_gpu_substep = false;
 #if !USE_CUDA_BSSN
          em_rhs_error =
              f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                   cg->fgfs[phi->sgfn],
                                   cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
@@ -1812,55 +1329,8 @@ void bssnEM_class::Step(int lev, int YN)
                                   cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
                                   cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
                                   cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
-                                   Symmetry, lev, ndeps);
+                                   Symmetry, lev, ndeps) ||
-#endif
+              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
 #if USE_CUDA_BSSN
          if (!em_rhs_error)
          {
            double *state_in[BSSN_EM_CUDA_STATE_COUNT];
            double *state_out[BSSN_EM_CUDA_STATE_COUNT];
            double *sources[BSSN_EM_CUDA_SOURCE_COUNT];
            double propspeed[BSSN_EM_CUDA_STATE_COUNT];
            double soa_flat[3 * BSSN_EM_CUDA_STATE_COUNT];
            if (!fill_bssn_em_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
                !fill_bssn_em_cuda_views(cg, SynchList_cor, state_out))
            {
              cout << "CUDA BSSN-EM state list mismatch on corrector step" << endl;
              MPI_Abort(MPI_COMM_WORLD, 1);
            }
            fill_bssn_em_fixed_source_cuda_views(cg, sources, Jx, Jy, Jz, qchar);
            int apply_bam_bc = 0;
 #if (SommerType == 0)
 #ifndef WithShell
            apply_bam_bc = (lev == 0) ? 1 : 0;
 #endif
 #endif
            int apply_enforce_ga = 0;
 #if (AGM == 0)
            apply_enforce_ga = 1;
 #elif (AGM == 1)
            if (iter_count == 3)
              apply_enforce_ga = 1;
 #endif
            int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
            if (bssn_em_cuda_rk4_substep(cg,
                                         cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                         state_in, state_out, sources,
                                         propspeed, soa_flat, Pp->data->bbox,
                                         dT_lev, TRK4, iter_count, apply_bam_bc,
                                         Symmetry, lev, ndeps, cor,
                                         keep_resident_state, apply_enforce_ga, chitiny))
            {
              ERROR = 1;
            }
            used_gpu_substep = true;
          }
 #endif
          if (em_rhs_error ||
              (!used_gpu_substep &&
               f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                 cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                 cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -1892,7 +1362,7 @@ void bssnEM_class::Step(int lev, int YN)
                                 cg->fgfs[Cons_Ham->sgfn],
                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                                 Symmetry, lev, ndeps, cor)))
+                                 Symmetry, lev, ndeps, cor))
          {
            cout << "find NaN in domain: (" 
                 << cg->bbox[0] << ":" << cg->bbox[3] << "," 
@@ -1900,8 +1370,6 @@ void bssnEM_class::Step(int lev, int YN)
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
          if (!used_gpu_substep)
          {
          // rk4 substep and boundary
          {
            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; 
@@ -1944,7 +1412,6 @@ void bssnEM_class::Step(int lev, int YN)
            }
          }
          f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
          }
        }
        if (BP == Pp->data->ble)
          break;
@@ -2216,13 +1683,7 @@ void bssnEM_class::Step(int lev, int YN)
    }
 #endif
-    if (em_step_timing)
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
      em_t_corrector += MPI_Wtime() - em_t0;
    if (em_step_timing)
      em_t0 = MPI_Wtime();
    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
    if (em_step_timing)
      em_t_corrector_sync += MPI_Wtime() - em_t0;
 #ifdef WithShell
    if (lev == 0)
@@ -2244,8 +1705,6 @@ void bssnEM_class::Step(int lev, int YN)
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
    {
      if (em_step_timing)
        em_t0 = MPI_Wtime();
      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
      for (int ithBH = 0; ithBH < BH_num; ithBH++)
      {
@@ -2274,14 +1733,10 @@ void bssnEM_class::Step(int lev, int YN)
          DG_List->clearList();
        }
      }
      if (em_step_timing)
        em_t_bh += MPI_Wtime() - em_t0;
    }
    // swap time level
    if (iter_count < 3)
    {
      if (em_step_timing)
        em_t0 = MPI_Wtime();
      Pp = GH->PatL[lev];
      while (Pp)
      {
@@ -2325,32 +1780,12 @@ void bssnEM_class::Step(int lev, int YN)
          Porg[ithBH][2] = Porg1[ithBH][2];
        }
      }
      if (em_step_timing)
        em_t_swap += MPI_Wtime() - em_t0;
    }
  }
 #if USE_CUDA_BSSN
  if (use_cuda_resident_sync)
  {
    if (em_step_timing)
      em_t0 = MPI_Wtime();
    const bool needs_resident_download =
        !bssn_em_cuda_keep_resident_after_step(lev, trfls, a_lev);
    if (needs_resident_download)
      bssn_em_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
    if (em_step_timing)
      em_t_resident += MPI_Wtime() - em_t0;
  }
 #endif
 #if (RPS == 0)
  // mesh refinement boundary part
  if (em_step_timing)
    em_t0 = MPI_Wtime();
  RestrictProlong(lev, YN, BB);
  if (em_step_timing)
    em_t_rp += MPI_Wtime() - em_t0;
 #ifdef WithShell
  if (lev == 0)
@@ -2378,8 +1813,6 @@ void bssnEM_class::Step(int lev, int YN)
  //
  // OldStateList  old -----------
  // update
  if (em_step_timing)
    em_t0 = MPI_Wtime();
  Pp = GH->PatL[lev];
  while (Pp)
  {
@@ -2425,26 +1858,6 @@ void bssnEM_class::Step(int lev, int YN)
      Porg0[ithBH][2] = Porg1[ithBH][2];
    }
  }
  if (em_step_timing)
  {
    em_t_swap += MPI_Wtime() - em_t0;
    static int em_step_report_count = 0;
    const int em_timing_every = bssn_em_step_timing_every();
    const bool report_all_levels = bssn_em_step_timing_all_levels_enabled();
    if (lev == GH->levels - 1)
      ++em_step_report_count;
    if ((report_all_levels || lev == GH->levels - 1) &&
        (em_timing_every <= 1 || em_step_report_count % em_timing_every == 0))
    {
      fprintf(stderr,
              "[AMSS-EM-STEP-TIMING] lev=%d wall=%.6f predictor=%.6f pre_sync=%.6f "
              "analysis=%.6f corrector=%.6f cor_sync=%.6f bh=%.6f swap=%.6f resident=%.6f rp=%.6f\n",
              lev, MPI_Wtime() - em_step_t0,
              em_t_predictor, em_t_predictor_sync,
              em_t_analysis, em_t_corrector, em_t_corrector_sync,
              em_t_bh, em_t_swap, em_t_resident, em_t_rp);
    }
  }
 }
 //================================================================================================
@@ -2623,59 +2036,6 @@ void bssnEM_class::AnalysisStuff_EM(int lev, double dT_lev)
  if (LastAnas >= AnasTime)
  {
 #if USE_CUDA_BSSN
    const bool zero_em_analysis =
        bssn_em_analysis_zero_fastpath_ready(GH->PatL[lev],
 #ifdef WithShell
                                             SH
 #else
                                             0
 #endif
                                             , myrank
        );
 #else
    const bool zero_em_analysis = false;
 #endif
    if (zero_em_analysis)
    {
 #if USE_CUDA_BSSN
      zero_em_analysis_outputs(GH->PatL[lev],
 #ifdef WithShell
                               SH,
 #else
                               0,
 #endif
                               myrank,
                               Rphi2, Iphi2, Rphi1, Iphi1);
 #endif
      int NN = 0;
      for (int pl = 1; pl < maxl + 1; pl++)
        for (int pm = -pl; pm < pl + 1; pm++)
          NN++;
      double *RP = new double[NN];
      double *IP = new double[NN];
      std::memset(RP, 0, NN * sizeof(double));
      std::memset(IP, 0, NN * sizeof(double));
      for (int i = 0; i < decn; i++)
        Phi2Monitor->writefile(PhysTime, NN, RP, IP);
      delete[] RP;
      delete[] IP;
      NN = 0;
      for (int pl = 0; pl < maxl + 1; pl++)
        for (int pm = -pl; pm < pl + 1; pm++)
          NN++;
      RP = new double[NN];
      IP = new double[NN];
      std::memset(RP, 0, NN * sizeof(double));
      std::memset(IP, 0, NN * sizeof(double));
      for (int i = 0; i < decn; i++)
        Phi1Monitor->writefile(PhysTime, NN, RP, IP);
      delete[] RP;
      delete[] IP;
    }
    else
    {
    Compute_Phi2(lev);
    double *RP, *IP;
    int NN = 0;
@@ -2764,7 +2124,6 @@ void bssnEM_class::AnalysisStuff_EM(int lev, double dT_lev)
    }
    delete[] RP;
    delete[] IP;
    }
  }
  AnalysisStuff(lev, dT_lev); // LastAnas need and only need control here
@@ -2945,12 +2304,11 @@ void bssnEM_class::Interp_Constraint()
  }
  ofstream outfile;
-  char suffix[64];
+  char filename[50];
-  sprintf(suffix, "/interp_constraint_%05d.dat", int(PhysTime / dT + 0.5));
+  sprintf(filename, "%s/interp_constraint_%05d.dat", ErrorMonitor->out_dir.c_str(), int(PhysTime / dT + 0.5)); 
  string filename = ErrorMonitor->out_dir + suffix;
  // 0.5 for round off
-  outfile.open(filename.c_str());
+  outfile.open(filename);
  outfile << "#  corrdinate, H_Res, Px_Res, Py_Res, Pz_Res, Gx_Res, Gy_Res, Gz_Res, ...." << endl;
  for (int i = 0; i < n; i++)
  {
--- a/AMSS_NCKU_source/bssnEScalar_class.C
+++ b/AMSS_NCKU_source/bssnEScalar_class.C
@@ -3,7 +3,6 @@
 #include <sstream>
 #include <cstdio>
 #include <map>
 #include <string>
 using namespace std;
 #else
 #include <stdio.h>
@@ -26,9 +25,6 @@ using namespace std;
 #include "getnp4.h"
 #include "shellfunctions.h"
 #include "parameters.h"
 #if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
 #endif
 #ifdef With_AHF
 #include "derivatives.h"
@@ -37,300 +33,6 @@ using namespace std;
 //================================================================================================
 namespace
 {
 #if USE_CUDA_BSSN
 bool fill_bssn_escalar_cuda_views(Block *cg, MyList<var> *vars,
                                  double **host_views,
                                  double *propspeeds = 0,
                                  double *soa_flat = 0)
 {
  int idx = 0;
  while (vars && idx < BSSN_ESCALAR_CUDA_STATE_COUNT)
  {
    host_views[idx] = cg->fgfs[vars->data->sgfn];
    if (propspeeds)
      propspeeds[idx] = vars->data->propspeed;
    if (soa_flat)
    {
      soa_flat[3 * idx + 0] = vars->data->SoA[0];
      soa_flat[3 * idx + 1] = vars->data->SoA[1];
      soa_flat[3 * idx + 2] = vars->data->SoA[2];
    }
    vars = vars->next;
    ++idx;
  }
  return idx == BSSN_ESCALAR_CUDA_STATE_COUNT && vars == 0;
 }
 bool bssn_escalar_cuda_use_resident_sync(int lev)
 {
 #ifdef WithShell
  (void)lev;
  return false;
 #else
  return true;
 #endif
 }
 bool bssn_escalar_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
 {
  static int keep_all_levels = -1;
  if (keep_all_levels < 0)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS");
    keep_all_levels = (env && atoi(env) != 0) ? 1 : 0;
  }
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  if (!enabled)
    return false;
  if (lev == analysis_lev)
    return false;
  static int release_only_level = -2;
  if (release_only_level == -2)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_RELEASE_ONLY_LEVEL");
    release_only_level = (env && atoi(env) >= 0) ? atoi(env) : -1;
  }
  if (release_only_level >= 0)
    return lev != release_only_level;
  static int keep_level_limit = -2;
  if (keep_level_limit == -2)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_LEVELS_BELOW");
    keep_level_limit = (env && atoi(env) >= 0) ? atoi(env) : -1;
  }
  if (keep_level_limit >= 0)
    return lev < keep_level_limit;
  if (keep_all_levels)
    return true;
  return lev < trfls_in;
 }
 bool bssn_escalar_sync_merged_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_ESCALAR_SYNC_MERGED");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 void bssn_escalar_sync_level(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
 {
  if (bssn_escalar_sync_merged_enabled())
    Parallel::Sync_merged(PatL, VarList, Symmetry);
  else
    Parallel::Sync(PatL, VarList, Symmetry);
 }
 bool bssn_escalar_timing_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_ESCALAR_STEP_TIMING");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool bssn_escalar_cuda_post_rp_download_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_POST_RP_DOWNLOAD");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool bssn_escalar_cuda_post_rp_download_level_enabled(int lev)
 {
  if (!bssn_escalar_cuda_post_rp_download_enabled())
    return false;
  static int min_level = -2;
  if (min_level == -2)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_POST_RP_MIN_LEVEL");
    min_level = (env && atoi(env) >= 0) ? atoi(env) : -1;
  }
  return min_level < 0 || lev >= min_level;
 }
 bool bssn_escalar_cuda_post_swap_release_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_POST_SWAP_RELEASE");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool bssn_escalar_cuda_pre_rp_release_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_PRE_RP_RELEASE");
    enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
  }
  return enabled != 0;
 }
 bool bssn_escalar_cuda_bh_interp_resident_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_BH_INTERP_RESIDENT");
    enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
  }
  return enabled != 0;
 }
 bool bssn_escalar_cuda_prune_after_swap_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_PRUNE_AFTER_SWAP");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 void bssn_escalar_cuda_upload_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                          int myrank)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
      {
        double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
        if (!fill_bssn_escalar_cuda_views(cg, vars, state_in))
        {
          cout << "CUDA BSSN-EScalar resident state list mismatch during upload" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_escalar_cuda_upload_resident_state(cg, cg->shape, state_in))
        {
          cout << "CUDA BSSN-EScalar resident state upload failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 }
 void bssn_escalar_cuda_keep_only_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                             int myrank)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
      {
        double *state_key[BSSN_ESCALAR_CUDA_STATE_COUNT];
        if (!fill_bssn_escalar_cuda_views(cg, vars, state_key))
        {
          cout << "CUDA BSSN-EScalar resident state list mismatch during prune" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_escalar_cuda_keep_only_resident_state(cg, cg->shape, state_key))
        {
          cout << "CUDA BSSN-EScalar resident state prune failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 }
 void bssn_escalar_timing_report(int myrank, int lev, int YN, double total, double rhs,
                                double sync, double bh, double analysis, double swap,
                                double resident, double rp)
 {
  if (!bssn_escalar_timing_enabled())
    return;
  double local[8] = {total, rhs, sync, bh, analysis, swap, resident, rp};
  double maxv[8] = {};
  MPI_Reduce(local, maxv, 8, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  if (myrank == 0)
    fprintf(stderr,
            "[AMSS-ESCALAR-STEP] lev=%d YN=%d total=%.6f rhs=%.6f sync=%.6f "
            "bh=%.6f analysis=%.6f swap=%.6f resident=%.6f rp=%.6f other=%.6f\n",
            lev, YN, maxv[0], maxv[1], maxv[2], maxv[3], maxv[4], maxv[5],
            maxv[6], maxv[7],
            maxv[0] - maxv[1] - maxv[2] - maxv[3] - maxv[4] - maxv[5] - maxv[6] - maxv[7]);
 }
 void bssn_escalar_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                            int myrank, bool release_ctx)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
      {
        double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
        if (!fill_bssn_escalar_cuda_views(cg, vars, state_out))
        {
          cout << "CUDA BSSN-EScalar resident state list mismatch during download" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out))
        {
          cout << "CUDA BSSN-EScalar resident state download failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (release_ctx)
          bssn_cuda_release_step_ctx(cg);
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 }
 #endif
 }
 //================================================================================================
 // Define bssnEScalar_class
 // It inherits some members and methods from the parent class bssn_class and modifies others.
@@ -477,11 +179,6 @@ void bssnEScalar_class::Initialize()
 bssnEScalar_class::~bssnEScalar_class()
 {
 #if USE_CUDA_BSSN
  for (int lev = 0; GH && lev < GH->levels; ++lev)
    bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, true);
 #endif
  delete Sphio;
  delete Spio;
  delete Sphi0;
@@ -1011,11 +708,6 @@ void bssnEScalar_class::Read_Pablo()
 void bssnEScalar_class::Step(int lev, int YN)
 {
  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
 #if USE_CUDA_BSSN
  const bool use_cuda_resident_sync = bssn_escalar_cuda_use_resident_sync(lev);
 #else
  const bool use_cuda_resident_sync = false;
 #endif
 #ifdef With_AHF
  AH_Step_Find(lev, dT_lev);
 #endif
@@ -1027,19 +719,9 @@ void bssnEScalar_class::Step(int lev, int YN)
  int iter_count = 0; // count RK4 substeps
  int pre = 0, cor = 1;
  int ERROR = 0;
  const bool escalar_step_timing = bssn_escalar_timing_enabled();
  const double escalar_step_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
  double escalar_t_rhs = 0.0;
  double escalar_t_sync = 0.0;
  double escalar_t_bh = 0.0;
  double escalar_t_analysis = 0.0;
  double escalar_t_swap = 0.0;
  double escalar_t_resident = 0.0;
  double escalar_t_rp = 0.0;
  MyList<ss_patch> *sPp;
  // Predictor
  double escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
  {
@@ -1050,59 +732,14 @@ void bssnEScalar_class::Step(int lev, int YN)
      if (myrank == cg->rank)
      {
 #if (AGM == 0)
 #if !USE_CUDA_BSSN
        f_enforce_ga(cg->shape,
                     cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                     cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                     cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
 #endif
-        bool used_gpu_substep = false;
+        if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
 #if USE_CUDA_BSSN
        {
          double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
          double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
          double propspeed[BSSN_ESCALAR_CUDA_STATE_COUNT];
          double soa_flat[3 * BSSN_ESCALAR_CUDA_STATE_COUNT];
          if (!fill_bssn_escalar_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
              !fill_bssn_escalar_cuda_views(cg, SynchList_pre, state_out))
          {
            cout << "CUDA BSSN-EScalar state list mismatch on predictor step" << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          int apply_bam_bc = 0;
          int apply_enforce_ga = 0;
 #if (AGM == 0)
          apply_enforce_ga = 1;
 #endif
 #if (SommerType == 0)
 #ifndef WithShell
          apply_bam_bc = (lev == 0) ? 1 : 0;
 #endif
 #endif
          int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
          if (bssn_escalar_cuda_rk4_substep(cg,
                                            cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                            state_in, state_out,
                                            propspeed, soa_flat, Pp->data->bbox,
                                            dT_lev, TRK4, iter_count, apply_bam_bc,
                                            Symmetry, lev, ndeps, pre,
                                            keep_resident_state, apply_enforce_ga, chitiny))
          {
            cout << "CUDA BSSN-EScalar predictor substep failed in domain: ("
                 << cg->bbox[0] << ":" << cg->bbox[3] << ","
                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
          used_gpu_substep = true;
        }
 #endif
        if (!used_gpu_substep &&
            f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                       cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -1146,8 +783,6 @@ void bssnEScalar_class::Step(int lev, int YN)
          ERROR = 1;
        }
        if (!used_gpu_substep)
        {
        // rk4 substep and boundary
        {
          MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here
@@ -1187,7 +822,6 @@ void bssnEScalar_class::Step(int lev, int YN)
          }
        }
        f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
        }
      }
      if (BP == Pp->data->ble)
        break;
@@ -1211,8 +845,6 @@ void bssnEScalar_class::Step(int lev, int YN)
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
  if (escalar_step_timing)
    escalar_t_rhs += MPI_Wtime() - escalar_t0;
 #ifdef WithShell
  // evolve Shell Patches
@@ -1361,14 +993,7 @@ void bssnEScalar_class::Step(int lev, int YN)
  }
 #endif
  escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
 #if USE_CUDA_BSSN
  bssn_escalar_sync_level(GH->PatL[lev], SynchList_pre, Symmetry);
 #else
  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
 #endif
  if (escalar_step_timing)
    escalar_t_sync += MPI_Wtime() - escalar_t0;
 #ifdef WithShell
  if (lev == 0)
@@ -1391,11 +1016,6 @@ void bssnEScalar_class::Step(int lev, int YN)
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
 #if USE_CUDA_BSSN
    if (use_cuda_resident_sync && !bssn_escalar_cuda_bh_interp_resident_enabled())
      bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, false);
 #endif
    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
@@ -1424,26 +1044,16 @@ void bssnEScalar_class::Step(int lev, int YN)
        DG_List->clearList();
      }
    }
    if (escalar_step_timing)
      escalar_t_bh += MPI_Wtime() - escalar_t0;
  }
  // data analysis part
  // Warning NOTE: the variables1 are used as temp storege room
  if (lev == a_lev)
  {
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
 #if USE_CUDA_BSSN
    if (use_cuda_resident_sync)
      bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
 #endif
    AnalysisStuff_EScalar(lev, dT_lev);
    if (escalar_step_timing)
      escalar_t_analysis += MPI_Wtime() - escalar_t0;
  }
  // corrector
  for (iter_count = 1; iter_count < 4; iter_count++)
  {
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
    // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
    if (iter_count == 1 || iter_count == 3)
      TRK4 += dT_lev / 2;
@@ -1457,13 +1067,11 @@ void bssnEScalar_class::Step(int lev, int YN)
        if (myrank == cg->rank)
        {
 #if (AGM == 0)
 #if !USE_CUDA_BSSN
          f_enforce_ga(cg->shape,
                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                       cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                       cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
                       cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif
 #elif (AGM == 1)
          if (iter_count == 3)
            f_enforce_ga(cg->shape,
@@ -1473,50 +1081,7 @@ void bssnEScalar_class::Step(int lev, int YN)
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif
-          bool used_gpu_substep = false;
+          if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
 #if USE_CUDA_BSSN
          {
            double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
            double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
            double propspeed[BSSN_ESCALAR_CUDA_STATE_COUNT];
            double soa_flat[3 * BSSN_ESCALAR_CUDA_STATE_COUNT];
            if (!fill_bssn_escalar_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
                !fill_bssn_escalar_cuda_views(cg, SynchList_cor, state_out))
            {
              cout << "CUDA BSSN-EScalar state list mismatch on corrector step" << endl;
              MPI_Abort(MPI_COMM_WORLD, 1);
            }
            int apply_bam_bc = 0;
            int apply_enforce_ga = 0;
 #if (AGM == 0)
            apply_enforce_ga = 1;
 #endif
 #if (SommerType == 0)
 #ifndef WithShell
            apply_bam_bc = (lev == 0) ? 1 : 0;
 #endif
 #endif
            int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
            if (bssn_escalar_cuda_rk4_substep(cg,
                                              cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                              state_in, state_out,
                                              propspeed, soa_flat, Pp->data->bbox,
                                              dT_lev, TRK4, iter_count, apply_bam_bc,
                                              Symmetry, lev, ndeps, cor,
                                              keep_resident_state, apply_enforce_ga, chitiny))
            {
              cout << "CUDA BSSN-EScalar corrector substep failed in domain: ("
                   << cg->bbox[0] << ":" << cg->bbox[3] << ","
                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
              ERROR = 1;
            }
            used_gpu_substep = true;
          }
 #endif
          if (!used_gpu_substep &&
              f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                         cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -1560,8 +1125,6 @@ void bssnEScalar_class::Step(int lev, int YN)
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
          if (!used_gpu_substep)
          {
          // rk4 substep and boundary
          {
            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; 
@@ -1604,7 +1167,6 @@ void bssnEScalar_class::Step(int lev, int YN)
            }
          }
          f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
          }
        }
        if (BP == Pp->data->ble)
          break;
@@ -1630,8 +1192,6 @@ void bssnEScalar_class::Step(int lev, int YN)
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    if (escalar_step_timing)
      escalar_t_rhs += MPI_Wtime() - escalar_t0;
 #ifdef WithShell
    // evolve Shell Patches
@@ -1789,14 +1349,7 @@ void bssnEScalar_class::Step(int lev, int YN)
    }
 #endif
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
 #if USE_CUDA_BSSN
    bssn_escalar_sync_level(GH->PatL[lev], SynchList_cor, Symmetry);
 #else
    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
 #endif
    if (escalar_step_timing)
      escalar_t_sync += MPI_Wtime() - escalar_t0;
 #ifdef WithShell
    if (lev == 0)
@@ -1818,11 +1371,6 @@ void bssnEScalar_class::Step(int lev, int YN)
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
    {
      escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
 #if USE_CUDA_BSSN
      if (use_cuda_resident_sync && !bssn_escalar_cuda_bh_interp_resident_enabled())
        bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
 #endif
      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
      for (int ithBH = 0; ithBH < BH_num; ithBH++)
      {
@@ -1851,13 +1399,10 @@ void bssnEScalar_class::Step(int lev, int YN)
          DG_List->clearList();
        }
      }
      if (escalar_step_timing)
        escalar_t_bh += MPI_Wtime() - escalar_t0;
    }
    // swap time level
    if (iter_count < 3)
    {
      escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
      Pp = GH->PatL[lev];
      while (Pp)
      {
@@ -1901,29 +1446,12 @@ void bssnEScalar_class::Step(int lev, int YN)
          Porg[ithBH][2] = Porg1[ithBH][2];
        }
      }
      if (escalar_step_timing)
        escalar_t_swap += MPI_Wtime() - escalar_t0;
    }
  }
 #if USE_CUDA_BSSN
  if (use_cuda_resident_sync)
  {
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
    if (!bssn_escalar_cuda_keep_resident_after_step(lev, trfls, a_lev))
      bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank,
                                             bssn_escalar_cuda_pre_rp_release_enabled());
    if (escalar_step_timing)
      escalar_t_resident += MPI_Wtime() - escalar_t0;
  }
 #endif
 #if (RPS == 0)
  // mesh refinement boundary part
  escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
  RestrictProlong(lev, YN, BB);
  if (escalar_step_timing)
    escalar_t_rp += MPI_Wtime() - escalar_t0;
 #ifdef WithShell
  if (lev == 0)
@@ -1950,7 +1478,6 @@ void bssnEScalar_class::Step(int lev, int YN)
  //
  // OldStateList  old -----------
  // update
  escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
  Pp = GH->PatL[lev];
  while (Pp)
  {
@@ -1985,25 +1512,6 @@ void bssnEScalar_class::Step(int lev, int YN)
      sPp = sPp->next;
    }
  }
 #endif
  #if USE_CUDA_BSSN
  bool release_after_sync = false;
  if (use_cuda_resident_sync && bssn_escalar_cuda_post_rp_download_level_enabled(lev))
  {
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
    release_after_sync = bssn_escalar_cuda_post_swap_release_enabled();
    bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, release_after_sync);
    if (escalar_step_timing)
      escalar_t_resident += MPI_Wtime() - escalar_t0;
  }
  if (use_cuda_resident_sync && !release_after_sync &&
      bssn_escalar_cuda_prune_after_swap_enabled())
  {
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
    bssn_escalar_cuda_keep_only_level_state(GH->PatL[lev], StateList, myrank);
    if (escalar_step_timing)
      escalar_t_resident += MPI_Wtime() - escalar_t0;
  }
 #endif
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -2015,14 +1523,6 @@ void bssnEScalar_class::Step(int lev, int YN)
      Porg0[ithBH][2] = Porg1[ithBH][2];
    }
  }
  if (escalar_step_timing)
  {
    escalar_t_swap += MPI_Wtime() - escalar_t0;
    bssn_escalar_timing_report(myrank, lev, YN, MPI_Wtime() - escalar_step_t0,
                               escalar_t_rhs, escalar_t_sync, escalar_t_bh,
                               escalar_t_analysis, escalar_t_swap,
                               escalar_t_resident, escalar_t_rp);
  }
 }
 //================================================================================================
@@ -2524,12 +2024,11 @@ void bssnEScalar_class::Interp_Constraint()
  }
  ofstream outfile;
-  char suffix[64];
+  char filename[50];
-  sprintf(suffix, "/interp_constraint_%05d.dat", int(PhysTime / dT + 0.5));
+  sprintf(filename, "%s/interp_constraint_%05d.dat", ErrorMonitor->out_dir.c_str(), int(PhysTime / dT + 0.5)); 
  string filename = ErrorMonitor->out_dir + suffix;
  // 0.5 for round off
-  outfile.open(filename.c_str());
+  outfile.open(filename);
  outfile << "#  corrdinate, H_Res, Px_Res, Py_Res, Pz_Res, Gx_Res, Gy_Res, Gz_Res, fR_Res, ...." << endl;
  for (int i = 0; i < n; i++)
  {
@@ -2578,37 +2077,7 @@ void bssnEScalar_class::Constraint_Out()
            Block *cg = BP->data;
            if (myrank == cg->rank)
            {
-              bool used_cuda_constraints = false;
+              if (lev > 0)
 #if USE_CUDA_BSSN
              {
                double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
                if (!fill_bssn_escalar_cuda_views(cg, StateList, state_in))
                {
                  cout << "CUDA BSSN-EScalar constraint state list mismatch" << endl;
                  MPI_Abort(MPI_COMM_WORLD, 1);
                }
                double *constraint_out[8] = {
                    cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn],
                    cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
                    cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn],
                    cg->fgfs[Cons_Gz->sgfn], cg->fgfs[Cons_fR->sgfn]};
                int lev_arg = lev;
                int sym_arg = Symmetry;
                double eps_arg = ndeps;
                if (bssn_escalar_cuda_compute_constraints(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                                          state_in, constraint_out,
                                                          sym_arg, lev_arg, eps_arg))
                {
                  cout << "CUDA BSSN-EScalar constraint compute failed in domain: ("
                       << cg->bbox[0] << ":" << cg->bbox[3] << ","
                       << cg->bbox[1] << ":" << cg->bbox[4] << ","
                       << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
                  MPI_Abort(MPI_COMM_WORLD, 1);
                }
                used_cuda_constraints = true;
              }
 #endif
              if (!used_cuda_constraints && lev > 0)
                f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                           cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                           cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
@@ -2645,8 +2114,7 @@ void bssnEScalar_class::Constraint_Out()
                                           cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
                                           cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
                                           Symmetry, lev, ndeps, pre);
-              if (!used_cuda_constraints)
+              f_compute_constraint_fr(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                f_compute_constraint_fr(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                      cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], 
                                      cg->fgfs[rho->sgfn], cg->fgfs[Sphi0->sgfn],
                                      cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -144,7 +144,7 @@ public:
       bssn_class(double Couranti, double StartTimei, double TotalTimei, double DumpTimei, double d2DumpTimei, double CheckTimei, double AnasTimei,
                  int Symmetryi, int checkruni, char *checkfilenamei, double numepssi, double numepsbi, double numepshi,
                  int a_levi, int maxli, int decni, double maxrexi, double drexi);
-       virtual ~bssn_class();
+       ~bssn_class();
       void Evolve(int Steps);
       void RecursiveStep(int lev);
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -1,56 +0,0 @@
 #ifndef BSSN_GPU_H_
 #define BSSN_GPU_H_
 #include "bssn_macro.h"
 #include "macrodef.h"
 #define DEVICE_ID 0
 // #define DEVICE_ID_BY_MPI_RANK
 #define GRID_DIM 256
 #define BLOCK_DIM 128
 #define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]]
 #define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]]
 #define pow2(x) ((x) * (x))
 #define TimeBetween(a, b) ((b.tv_sec - a.tv_sec) + (b.tv_usec - a.tv_usec) / 1000000.0f)
 #define M_ metac.
 #define Mh_ meta->
 #define Ms_ metassc.
 #define Msh_ metass->
 // #define TIMING
 #define RHS_SS_PARA int calledby, int mpi_rank, int *ex, double &T, double *crho, double *sigma, double *R, double *X, double *Y, double *Z, double *drhodx, double *drhody, double *drhodz, double *dsigmadx, double *dsigmady, double *dsigmadz, double *dRdx, double *dRdy, double *dRdz, double *drhodxx, double *drhodxy, double *drhodxz, double *drhodyy, double *drhodyz, double *drhodzz, double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, double *dsigmadyy, double *dsigmadyz, double *dsigmadzz, double *dRdxx, double *dRdxy, double *dRdxz, double *dRdyy, double *dRdyz, double *dRdzz, double *chi, double *trK, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, double *Gamx, double *Gamy, double *Gamz, double *Lap, double *betax, double *betay, double *betaz, double *dtSfx, double *dtSfy, double *dtSfz, double *chi_rhs, double *trK_rhs, double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, double *rho, double *Sx, double *Sy, double *Sz, double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, int &Symmetry, int &Lev, double &eps, int &sst, int &co
 /**  main function */
 int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,
            double *X, double *Y, double *Z,
            double *chi, double *trK,
            double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
            double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
            double *Gamx, double *Gamy, double *Gamz,
            double *Lap, double *betax, double *betay, double *betaz,
            double *dtSfx, double *dtSfy, double *dtSfz,
            double *chi_rhs, double *trK_rhs,
            double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
            double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
            double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
            double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
            double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
            double *rho, double *Sx, double *Sy, double *Sz, double *Sxx,
            double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
            double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
            double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
            double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
            double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
            double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
            double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
            int &Symmetry, int &Lev, double &eps, int &co);
 int gpu_rhs_ss(RHS_SS_PARA);
 #define Z4C_SS_PARA int calledby, int mpi_rank, int *ex, double &T, double *crho, double *sigma, double *R, double *X, double *Y, double *Z, double *drhodx, double *drhody, double *drhodz, double *dsigmadx, double *dsigmady, double *dsigmadz, double *dRdx, double *dRdy, double *dRdz, double *drhodxx, double *drhodxy, double *drhodxz, double *drhodyy, double *drhodyz, double *drhodzz, double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, double *dsigmadyy, double *dsigmadyz, double *dsigmadzz, double *dRdxx, double *dRdxy, double *dRdxz, double *dRdyy, double *dRdyz, double *dRdzz, double *chi, double *trK, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, double *Gamx, double *Gamy, double *Gamz, double *Lap, double *betax, double *betay, double *betaz, double *dtSfx, double *dtSfy, double *dtSfz, double *TZ, double *chi_rhs, double *trK_rhs, double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, double *TZ_rhs, double *rho, double *Sx, double *Sy, double *Sz, double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, int &Symmetry, int &Lev, double &eps, int &sst, int &co
 int gpu_rhs_z4c_ss(Z4C_SS_PARA);
 #endif
--- a/AMSS_NCKU_source/bssn_gpu_rhs_ss.cu
+++ b/AMSS_NCKU_source/bssn_gpu_rhs_ss.cu
@@ -20,14 +20,12 @@ using namespace std;
 __device__ volatile unsigned int global_count = 0;
 #ifdef RESULT_CHECK
 void compare_result_gpu(int ftag1,double * datac,int data_num){
 	double * data = (double*)malloc(sizeof(double)*data_num);
 	cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
 	compare_result(ftag1,data,data_num);
 	free(data);
 }
 #endif
 __global__ void sub_symmetry_bd_ss_partF(int ord, double * func, double *funcc)
 {
@@ -155,11 +153,11 @@ __global__ void sub_symmetry_bd_ss_partJ(int ord,double * func, double * funcc,d
 inline void sub_symmetry_bd_ss(int ord,double * func, double * funcc,double * SoA){
 	sub_symmetry_bd_ss_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	sub_symmetry_bd_ss_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	sub_symmetry_bd_ss_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 }
 __global__ void sub_fderivs_shc_part1(double *fx,double *fy,double *fz){
@@ -249,13 +247,13 @@ inline void sub_fderivs_shc(int& sst,double * f,double * fh,double *fx,double *f
 	//cudaMemset(Msh_ gy,0,h_3D_SIZE[0] * sizeof(double));
 	//cudaMemset(Msh_ gz,0,h_3D_SIZE[0] * sizeof(double));
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 			//compare_result_gpu(0,fh,h_3D_SIZE[2]);	
 	sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	sub_fderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fx,fy,fz);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 			//compare_result_gpu(1,fx,h_3D_SIZE[0]);
 			//compare_result_gpu(2,fy,h_3D_SIZE[0]);
 			//compare_result_gpu(3,fz,h_3D_SIZE[0]);
@@ -453,17 +451,17 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
 	//fderivs_sh
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 			//compare_result_gpu(1,fh,h_3D_SIZE[2]);	
 	sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	//fdderivs_sh
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 			//compare_result_gpu(21,fh,h_3D_SIZE[2]);
 	sub_fdderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gxx,Msh_ gxy,Msh_ gxz,Msh_ gyy,Msh_ gyz,Msh_ gzz);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 			/*compare_result_gpu(11,Msh_ gx,h_3D_SIZE[0]);
 			compare_result_gpu(12,Msh_ gy,h_3D_SIZE[0]);
 			compare_result_gpu(13,Msh_ gz,h_3D_SIZE[0]);
@@ -474,7 +472,7 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
 			compare_result_gpu(5,Msh_ gyz,h_3D_SIZE[0]);
 			compare_result_gpu(6,Msh_ gzz,h_3D_SIZE[0]);*/
 	sub_fdderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fxx,fxy,fxz,fyy,fyz,fzz);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 			/*compare_result_gpu(1,fxx,h_3D_SIZE[0]);
 			compare_result_gpu(2,fxy,h_3D_SIZE[0]);
 			compare_result_gpu(3,fxz,h_3D_SIZE[0]);
@@ -498,9 +496,9 @@ __global__ void computeRicci_ss_part1(double * dst)
 inline void computeRicci_ss(int &sst,double * src,double* dst,double * SoA, Meta* meta)
 {
 	sub_fdderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	computeRicci_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 }
 __global__ void sub_lopsided_ss_part1(double * dst)
@@ -518,9 +516,9 @@ __global__ void sub_lopsided_ss_part1(double * dst)
 inline void sub_lopsided_ss(int& sst,double *src,double* dst,double *SoA)
 {
 		sub_fderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,SoA);
-		cudaDeviceSynchronize();
+		cudaThreadSynchronize();
 		sub_lopsided_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-		cudaDeviceSynchronize();
+		cudaThreadSynchronize();
 }
 __global__ void sub_kodis_sh_part1(double *f,double *fh,double *f_rhs)
@@ -592,11 +590,11 @@ inline void sub_kodis_ss(int &sst,double *f,double *fh,double *f_rhs,double *SoA
 	}
 			//compare_result_gpu(10,f,h_3D_SIZE[0]);
 	sub_symmetry_bd_ss(3,f,fh,SoA1);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 			//compare_result_gpu(0,fh,h_3D_SIZE[3]);
 	sub_kodis_sh_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 			//compare_result_gpu(1,f_rhs,h_3D_SIZE[0]);
 }
@@ -1701,7 +1699,7 @@ void destroy_meta(Meta *meta,Metass *metass)
 	if(Msh_ gzz) cudaFree(Msh_ gzz);
 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
-	if(Mh_ reta) cudaFree(Mh_ reta);
+	if(Mh_ reta) CUDA_SAFE_CALL(cudaFree(Mh_ reta));
 #endif	
@@ -1897,7 +1895,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	//1.2 local Data
 	cudaMalloc((void**)&(Mh_ gxx), matrix_size * sizeof(double));
-	cudaMalloc((void**)&(Mh_ gyy), matrix_size * sizeof(double));
+	CUDA_SAFE_CALL( cudaMalloc((void**)&(Mh_ gyy), matrix_size * sizeof(double)));
 	cudaMalloc((void**)&(Mh_ gzz), matrix_size * sizeof(double));
 	cudaMalloc((void**)&(Mh_ chix), matrix_size * sizeof(double));
 	cudaMalloc((void**)&(Mh_ chiy), matrix_size * sizeof(double));
@@ -2162,7 +2160,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	double tmp_con2 = 1/Mass[0] - tmp_con;
 	cudaMemcpyToSymbol(C1, &tmp_con2, sizeof(double));
-	tmp_con2 = 1/Mass[1] - tmp_con;
+	double tmp_con2 = 1/Mass[1] - tmp_con;
 	cudaMemcpyToSymbol(C2, &tmp_con2, sizeof(double));
@@ -2235,7 +2233,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
  		if((sst == 2 || sst == 4) && abs[1] < dYh) 
  		{
  			ijkmin_h[1] = -2; 
-  			ijkmin3_h[1] = -3;
+  			ijkmin_h[1] = -3;
  		}
  		if((sst == 3 || sst == 5) && abs_Y_ex2 < dYh)
  		{
@@ -2289,13 +2287,13 @@ int gpu_rhs_ss(RHS_SS_PARA)
 #ifdef TIMING1
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl; 
 #endif	
 	//cout<<"GPU meta data ready.\n";
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 //-------------get device info-------------------------------------
@@ -2308,7 +2306,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	//sub_enforce_ga(matrix_size);
 	//4.1-----compute rhs---------
 	compute_rhs_ss_part1<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	sub_fderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
 	sub_fderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2324,7 +2322,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	sub_fderivs_shc(sst,Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
 	compute_rhs_ss_part2<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	sub_fdderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
 	sub_fdderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2334,7 +2332,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	sub_fderivs_shc( sst,Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
 	compute_rhs_ss_part3<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	computeRicci_ss(sst,Mh_ dxx,Mh_ Rxx,sss, meta);
 	computeRicci_ss(sst,Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2342,25 +2340,25 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	computeRicci_ss(sst,Mh_ gxy,Mh_ Rxy,aas, meta);
 	computeRicci_ss(sst,Mh_ gxz,Mh_ Rxz,asa, meta);
 	computeRicci_ss(sst,Mh_ gyz,Mh_ Ryz,saa, meta);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	compute_rhs_ss_part4<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	sub_fdderivs_shc(sst,Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
-	//cudaDeviceSynchronize();
+	//cudaThreadSynchronize();
 	//compare_result_gpu(0,Mh_ chi,h_3D_SIZE[0]);
 	//compare_result_gpu(1,Mh_ chi,h_3D_SIZE[0]);
 	//compare_result_gpu(2,Mh_ fyz,h_3D_SIZE[0]);
 	compute_rhs_ss_part5<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	sub_fdderivs_shc(sst,Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
 	compute_rhs_ss_part6<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
 	sub_fderivs_shc(sst,Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2425,7 +2423,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	}
 	if(co == 0){
 		compute_rhs_ss_part7<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaDeviceSynchronize();
+		cudaThreadSynchronize();
 		sub_fderivs_shc(sst,Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
 		sub_fderivs_shc(sst,Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2434,7 +2432,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 		sub_fderivs_shc(sst,Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
 		sub_fderivs_shc(sst,Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
 		compute_rhs_ss_part8<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaDeviceSynchronize();
+		cudaThreadSynchronize();
 	}
 #if (ABV == 1)
@@ -2514,7 +2512,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	//test kodis
 	//sub_kodis_sh(sst,Msh_ drhodx,Mh_ fh2,Msh_ drhody,sss);
 #ifdef TIMING
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl; 
 #endif
@@ -2524,55 +2522,4 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	return 0;//TODO return
 }
 #if (ABEtype == 2)
 // Z4C Shell GPU: calls BSSN gpu_rhs_ss with trKd=trK+2*TZ, then applies
 // TZ_rhs = alpn1*Hcon/2 and constraint damping on CPU.
 int gpu_rhs_z4c_ss(Z4C_SS_PARA)
 {
    int matrix_size = ex[0] * ex[1] * ex[2];
    double k1 = 0.02, k2 = 0.0;
    double *trKd_host = new double[matrix_size];
    for (int _i = 0; _i < matrix_size; _i++)
        trKd_host[_i] = trK[_i] + 2.0 * TZ[_i];
    int result = gpu_rhs_ss(calledby, mpi_rank,
        ex, T, crho, sigma, R, X, Y, Z,
        drhodx, drhody, drhodz, dsigmadx, dsigmady, dsigmadz,
        dRdx, dRdy, dRdz,
        drhodxx, drhodxy, drhodxz, drhodyy, drhodyz, drhodzz,
        dsigmadxx, dsigmadxy, dsigmadxz, dsigmadyy, dsigmadyz, dsigmadzz,
        dRdxx, dRdxy, dRdxz, dRdyy, dRdyz, dRdzz,
        chi, trKd_host, dxx, gxy, gxz, dyy, gyz, dzz,
        Axx, Axy, Axz, Ayy, Ayz, Azz,
        Gamx, Gamy, Gamz,
        Lap, betax, betay, betaz,
        dtSfx, dtSfy, dtSfz,
        chi_rhs, trK_rhs,
        gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
        Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
        Gamx_rhs, Gamy_rhs, Gamz_rhs,
        Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
        dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
        rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz,
        Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
        Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
        Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
        Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
        ham_Res, movx_Res, movy_Res, movz_Res,
        Gmx_Res, Gmy_Res, Gmz_Res,
        Symmetry, Lev, eps, sst, co);
    delete[] trKd_host;
    if (result != 0) return result;
    for (int _i = 0; _i < matrix_size; _i++) {
        double alp = Lap[_i] + 1.0;
        TZ_rhs[_i] = alp * ham_Res[_i] * 0.5;
        TZ_rhs[_i]  -= alp * (2.0 + k2) * k1 * TZ[_i];
        trK_rhs[_i] += alp * k1 * (1.0 - k2) * TZ[_i];
    }
    return 0;
 }
 #endif // ABEtype == 2
 #endif //WithShell
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -7,9 +7,6 @@ extern "C" {
 enum {
    BSSN_CUDA_STATE_COUNT = 24,
    BSSN_ESCALAR_CUDA_STATE_COUNT = 26,
    BSSN_EM_CUDA_STATE_COUNT = 32,
    BSSN_EM_CUDA_SOURCE_COUNT = 4,
    BSSN_CUDA_MATTER_COUNT = 10
 };
@@ -58,54 +55,6 @@ int bssn_cuda_rk4_substep(void *block_tag,
                          int &apply_enforce_ga,
                          double &chitiny);
 int bssn_escalar_cuda_rk4_substep(void *block_tag,
                                  int *ex, double *X, double *Y, double *Z,
                                  double **state_host_in,
                                  double **state_host_out,
                                  const double *propspeed,
                                  const double *soa_flat,
                                  const double *bbox,
                                  double &dT,
                                  double &T,
                                  int &RK4,
                                  int &apply_bam_bc,
                                  int &Symmetry,
 	                                  int &Lev,
 	                                  double &eps,
 	                                  int &co,
 	                                  int &keep_resident_state,
 	                                  int &apply_enforce_ga,
 	                                  double &chitiny);
 int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double *Z,
                                          double **state_host_in,
                                          double **constraint_host_out,
                                          int &Symmetry,
                                          int &Lev,
                                          double &eps);
 int bssn_em_cuda_rk4_substep(void *block_tag,
                             int *ex, double *X, double *Y, double *Z,
                             double **state_host_in,
                             double **state_host_out,
                             double **source_host,
                             const double *propspeed,
                             const double *soa_flat,
                             const double *bbox,
                             double &dT,
                             double &T,
                             int &RK4,
                             int &apply_bam_bc,
                             int &Symmetry,
                             int &Lev,
                             double &eps,
                             int &co,
                             int &keep_resident_state,
                             int &apply_enforce_ga,
                             double &chitiny);
 int bssn_em_cuda_resident_zero_fast_state(void *block_tag);
 int bssn_cuda_copy_state_region_to_host(void *block_tag,
                                        int state_index,
                                        double *host_state,
@@ -124,37 +73,6 @@ int bssn_cuda_download_resident_state(void *block_tag,
                                      int *ex,
                                      double **state_host_out);
 int bssn_escalar_cuda_download_resident_state(void *block_tag,
                                              int *ex,
                                              double **state_host_out);
 int bssn_cuda_upload_resident_state_count(void *block_tag,
                                          int *ex,
                                          double **state_host_in,
                                          int state_count);
 int bssn_escalar_cuda_upload_resident_state(void *block_tag,
                                            int *ex,
                                            double **state_host_in);
 int bssn_cuda_keep_only_resident_state_count(void *block_tag,
                                             int *ex,
                                             double **state_host_key,
                                             int state_count);
 int bssn_escalar_cuda_keep_only_resident_state(void *block_tag,
                                               int *ex,
                                               double **state_host_key);
 int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
                                                       int *ex,
                                                       double **state_host_out,
                                                       int state_count);
 int bssn_cuda_download_resident_state_if_present(void *block_tag,
                                                int *ex,
                                                double **state_host_out);
 int bssn_cuda_download_constraint_outputs(int *ex,
                                          double **constraint_host_out);
@@ -165,45 +83,6 @@ int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz);
 int bssn_cuda_interp_state_point3(void *block_tag,
                                  int *ex,
                                  int state0,
                                  int state1,
                                  int state2,
                                  double x0,
                                  double y0,
                                  double z0,
                                  double dx,
                                  double dy,
                                  double dz,
                                  double px,
                                  double py,
                                  double pz,
                                  int ordn,
                                  int symmetry,
                                  double **state_host_key,
                                  const double *soa3,
                                  double *out3);
 int bssn_cuda_interp_host_two_fields(void *block_tag,
                                     int *ex,
                                     double *field0,
                                     double *field1,
                                     double x0,
                                     double y0,
                                     double z0,
                                     double dx,
                                     double dy,
                                     double dz,
                                     const double *px,
                                     const double *py,
                                     const double *pz,
                                     int npoints,
                                     int ordn,
                                     int symmetry,
                                     const double *soa6,
                                     double *out_interleaved);
 int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
                                                   int state_index,
                                                   double *host_buffer,
@@ -211,15 +90,6 @@ int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
                                                   int i0, int j0, int k0,
                                                   int sx, int sy, int sz);
 int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  int state_index,
                                                                  double *host_buffer,
                                                                  int *ex,
                                                                  int i0, int j0, int k0,
                                                                  int sx, int sy, int sz);
 int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
                                              int state_count,
                                              double *host_buffer,
@@ -227,14 +97,6 @@ int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
                                              int i0, int j0, int k0,
                                              int sx, int sy, int sz);
 int bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
                                                             double **state_host_key,
                                                             int state_count,
                                                             double *host_buffer,
                                                             int *ex,
                                                             int i0, int j0, int k0,
                                                             int sx, int sy, int sz);
 int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                  int state_count,
                                                  double *host_buffer,
@@ -242,140 +104,6 @@ int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                  int i0, int j0, int k0,
                                                  int sx, int sy, int sz);
 int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
                                                                 double **state_host_key,
                                                                 int state_count,
                                                                 double *host_buffer,
                                                                 int *ex,
                                                                 int i0, int j0, int k0,
                                                                 int sx, int sy, int sz);
 int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                                int state_count,
                                                double *device_buffer,
                                                int *ex,
                                                int i0, int j0, int k0,
                                                int sx, int sy, int sz);
 int bssn_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                               double **state_host_key,
                                                               int state_count,
                                                               double *device_buffer,
                                                               int *ex,
                                                               int i0, int j0, int k0,
                                                               int sx, int sy, int sz);
 int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                    int state_count,
                                                    double *device_buffer,
                                                    int *ex,
                                                    int i0, int j0, int k0,
                                                    int sx, int sy, int sz);
 int bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
                                                                   double **state_host_key,
                                                                   int state_count,
                                                                   double *device_buffer,
                                                                   int *ex,
                                                                   int i0, int j0, int k0,
                                                                   int sx, int sy, int sz);
 int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
                                                   int *ex,
                                                   int segment_count,
                                                   const int *segment_meta);
 int bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *device_buffer,
                                                                  int *ex,
                                                                  int segment_count,
                                                                  const int *segment_meta);
 int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
                                                       int state_count,
                                                       double *device_buffer,
                                                       int *ex,
                                                       int segment_count,
                                                       const int *segment_meta);
 int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
                                                                      double **state_host_key,
                                                                      int state_count,
                                                                      double *device_buffer,
                                                                      int *ex,
                                                                      int segment_count,
                                                                      const int *segment_meta);
 int bssn_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
                                                       int state_count,
                                                       double *device_buffer,
                                                       int *ex,
                                                       int segment_count,
                                                       const int *segment_meta);
 int bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                      double **state_host_key,
                                                                      int state_count,
                                                                      double *device_buffer,
                                                                      int *ex,
                                                                      int segment_count,
                                                                      const int *segment_meta,
                                                                      const double *state_soa);
 int bssn_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
                                                      int state_count,
                                                      double *device_buffer,
                                                      int *ex,
                                                      int segment_count,
                                                      const int *segment_meta);
 int bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                     double **state_host_key,
                                                                     int state_count,
                                                                     double *device_buffer,
                                                                     int *ex,
                                                                     int segment_count,
                                                                     const int *segment_meta,
                                                                     const double *state_soa);
 int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                    int state_count,
                                                    double *device_buffer,
                                                    int *ex,
                                                    int sx, int sy, int sz,
                                                    int fi0, int fj0, int fk0);
 int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                   double **state_host_key,
                                                                   int state_count,
                                                                   double *device_buffer,
                                                                   int *ex,
                                                                   int sx, int sy, int sz,
                                                                   int fi0, int fj0, int fk0,
                                                                   const double *state_soa);
 int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
                                                   int *ex,
                                                   int sx, int sy, int sz,
                                                   int ii0, int jj0, int kk0,
                                                   int lbc_i, int lbc_j, int lbc_k);
 int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *device_buffer,
                                                                  int *ex,
                                                                  int sx, int sy, int sz,
                                                                  int ii0, int jj0, int kk0,
                                                                  int lbc_i, int lbc_j, int lbc_k,
                                                                  const double *state_soa);
 int bssn_cuda_download_state_subset(void *block_tag,
                                    int *ex,
                                    int subset_count,
@@ -388,26 +116,12 @@ int bssn_cuda_upload_state_subset(void *block_tag,
                                  const int *state_indices,
                                  double **state_host_in);
 int bssn_cuda_prepare_inter_time_level(void *block_tag,
                                       int *ex,
                                       int state_count,
                                       double **src1_host_key,
                                       double **src2_host_key,
                                       double **src3_host_key,
                                       double **dst_host_key,
                                       int source_count,
                                       int tindex);
 int bssn_cuda_has_resident_state(void *block_tag);
 void bssn_cuda_release_step_ctx(void *block_tag);
 #ifdef __cplusplus
 }
 // C++-only helpers declared for derived equation classes (Z4C, etc.)
 // Defined in bssn_class.C. Requires MyList, Patch, var from including TU.
 bool bssn_cuda_use_resident_sync(int lev);
 void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var> *vars, int myrank);
 #endif
 #endif
--- a/AMSS_NCKU_source/checkpoint.C
+++ b/AMSS_NCKU_source/checkpoint.C
@@ -76,11 +76,8 @@ checkpoint::checkpoint(bool checked, const char fname[], int myrank) : filename(
 	I_Print = (myrank == 0);
-	size_t filename_len = out_dir.size() + strlen(fname) + 32;
+	int i = strlen(fname);
-#ifdef CHECKDETAIL
+	filename = new char[i+30];
 	filename_len += 32;
 #endif
 	filename = new char[filename_len];
 	// cout << filename << endl;
 	// cout << i << endl;
@@ -106,7 +103,7 @@ checkpoint::checkpoint(bool checked, const char fname[], int myrank) : filename(
 checkpoint::~checkpoint()
 {
 	CheckList->clearList();
-	if (filename)
+	if (I_Print)
 		delete[] filename;
 }
@@ -139,7 +136,7 @@ void checkpoint::writecheck_cgh(double time, cgh *GH)
 	if (I_Print)
 	{
 		// char fname[50];
-		char fname[4096]; 
+		char fname[50+50]; 
 		sprintf(fname, "%s_cgh.CHK", filename);
 		outfile.open(fname, ios::out | ios::trunc);
@@ -198,7 +195,7 @@ void checkpoint::readcheck_cgh(double &time, cgh *GH, int myrank, int nprocs, in
 	int DIM = dim;
 	ifstream infile;
 	// char fname[50];
-	char fname[4096]; 
+	char fname[50+50]; 
 	sprintf(fname, "%s_cgh.CHK", filename);
 	infile.open(fname);
@@ -300,7 +297,7 @@ void checkpoint::writecheck_sh(double time, ShellPatch *SH)
 	if (I_Print)
 	{
-		char fname[4096];
+		char fname[50];
 		sprintf(fname, "%s_sh.CHK", filename);
 		outfile.open(fname, ios::out | ios::trunc);
@@ -338,7 +335,7 @@ void checkpoint::readcheck_sh(ShellPatch *SH, int myrank)
 	int DIM = dim;
 	ifstream infile;
 	// char fname[50];
-	char fname[4096]; 
+	char fname[50+50]; 
 	sprintf(fname, "%s_sh.CHK", filename);
 	infile.open(fname);
@@ -393,7 +390,7 @@ void checkpoint::write_Black_Hole_position(int BH_num_input, int BH_num, double
 	if (I_Print)
 	{
-		char fname[4096];
+		char fname[50];
 		sprintf(fname, "%s_BHp.CHK", filename);
 		outfile.open(fname, ios::out | ios::trunc);
@@ -420,7 +417,7 @@ void checkpoint::read_Black_Hole_position(int &BH_num_input, int &BH_num, double
 {
 	ifstream infile;
 	// char fname[50];
-	char fname[4096]; 
+	char fname[50+50]; 
 	sprintf(fname, "%s_BHp.CHK", filename);
 	infile.open(fname);
@@ -464,7 +461,7 @@ void checkpoint::write_bssn(double LastDump, double Last2dDump, double LastAnas)
 	if (I_Print)
 	{
-		char fname[4096];
+		char fname[50];
 		sprintf(fname, "%s_bssn.CHK", filename);
 		outfile.open(fname, ios::out | ios::trunc);
@@ -484,7 +481,7 @@ void checkpoint::read_bssn(double &LastDump, double &Last2dDump, double &LastAna
 {
 	ifstream infile;
 	// char fname[50];
-	char fname[4096]; 
+	char fname[50+50]; 
 	sprintf(fname, "%s_bssn.CHK", filename);
 	infile.open(fname);
@@ -509,7 +506,7 @@ void checkpoint::write_bssn(double LastDump, double Last2dDump, double LastAnas)
 	ofstream outfile;
 	// char fname[50];
-	char fname[4096]; 
+	char fname[50+50]; 
 	sprintf(fname, "%s_bssn.CHK", filename);
 	outfile.open(fname, ios::out | ios::trunc);
@@ -530,7 +527,7 @@ void checkpoint::read_bssn(double &LastDump, double &Last2dDump, double &LastAna
 {
 	ifstream infile;
 	// char fname[50];
-	char fname[4096]; 
+	char fname[50+50]; 
 	sprintf(fname, "%s_bssn.CHK", filename);
 	infile.open(fname);
@@ -554,7 +551,7 @@ void checkpoint::write_Black_Hole_position(int BH_num_input, int BH_num, double
 	ofstream outfile;
 	// char fname[50];
-	char fname[4096]; 
+	char fname[50+50]; 
 	sprintf(fname, "%s_BHp.CHK", filename);
 	outfile.open(fname, ios::out | ios::trunc);
@@ -584,7 +581,7 @@ void checkpoint::read_Black_Hole_position(int &BH_num_input, int &BH_num, double
 {
 	ifstream infile;
 	// char fname[50];
-	char fname[4096]; 
+	char fname[50+50]; 
 	sprintf(fname, "%s_BHp.CHK", filename);
 	infile.open(fname);
@@ -631,7 +628,7 @@ void checkpoint::writecheck_cgh(double time, cgh *GH)
 	ofstream outfile;
 	// char fname[50];
-	char fname[4096]; 
+	char fname[50+50]; 
 	sprintf(fname, "%s_cgh.CHK", filename);
 	outfile.open(fname, ios::out | ios::trunc);
@@ -741,7 +738,7 @@ void checkpoint::readcheck_cgh(double &time, cgh *GH, int myrank, int nprocs, in
 	int DIM = dim;
 	ifstream infile;
 	// char fname[50];
-	char fname[4096]; 
+	char fname[50+50]; 
 	sprintf(fname, "%s_cgh.CHK", filename);
 	infile.open(fname);
--- a/AMSS_NCKU_source/fd_cuda_helpers.cuh
+++ b/AMSS_NCKU_source/fd_cuda_helpers.cuh
@@ -1,412 +0,0 @@
 #ifndef AMSS_NCKU_FD_CUDA_HELPERS_CUH
 #define AMSS_NCKU_FD_CUDA_HELPERS_CUH
 #ifndef ghost_width
 #error "ghost_width must be defined before including fd_cuda_helpers.cuh"
 #endif
 #if ghost_width < 2 || ghost_width > 5
 #error "CUDA finite-difference helpers support ghost_width 2..5"
 #endif
 #define AMSS_FD_CENTER_RADIUS (ghost_width - 1)
 #define AMSS_FD_LK_RADIUS     (ghost_width)
 __device__ __forceinline__ int fd_axis_radius(int qF, int qminF, int qmaxF)
 {
 #if AMSS_FD_CENTER_RADIUS >= 4
    if (qF - 4 >= qminF && qF + 4 <= qmaxF) return 4;
 #endif
 #if AMSS_FD_CENTER_RADIUS >= 3
    if (qF - 3 >= qminF && qF + 3 <= qmaxF) return 3;
 #endif
 #if AMSS_FD_CENTER_RADIUS >= 2
    if (qF - 2 >= qminF && qF + 2 <= qmaxF) return 2;
 #endif
    if (qF - 1 >= qminF && qF + 1 <= qmaxF) return 1;
    return 0;
 }
 __device__ __forceinline__ int fd_common_radius(int iF, int jF, int kF,
                                                int iminF, int jminF, int kminF,
                                                int imaxF, int jmaxF, int kmaxF)
 {
    int r = fd_axis_radius(iF, iminF, imaxF);
    const int ry = fd_axis_radius(jF, jminF, jmaxF);
    const int rz = fd_axis_radius(kF, kminF, kmaxF);
    if (ry < r) r = ry;
    if (rz < r) r = rz;
    return r;
 }
 __device__ __forceinline__ double fd_first_coef(int r, int off)
 {
    switch (r) {
    case 1:
        if (off == -1) return -1.0;
        if (off ==  1) return  1.0;
        return 0.0;
    case 2:
        if (off == -2) return  1.0;
        if (off == -1) return -8.0;
        if (off ==  1) return  8.0;
        if (off ==  2) return -1.0;
        return 0.0;
    case 3:
        if (off == -3) return  -1.0;
        if (off == -2) return   9.0;
        if (off == -1) return -45.0;
        if (off ==  1) return  45.0;
        if (off ==  2) return  -9.0;
        if (off ==  3) return   1.0;
        return 0.0;
    case 4:
        if (off == -4) return    3.0;
        if (off == -3) return  -32.0;
        if (off == -2) return  168.0;
        if (off == -1) return -672.0;
        if (off ==  1) return  672.0;
        if (off ==  2) return -168.0;
        if (off ==  3) return   32.0;
        if (off ==  4) return   -3.0;
        return 0.0;
    default:
        return 0.0;
    }
 }
 __device__ __forceinline__ double fd_second_coef(int r, int off)
 {
    switch (r) {
    case 1:
        if (off == -1) return  1.0;
        if (off ==  0) return -2.0;
        if (off ==  1) return  1.0;
        return 0.0;
    case 2:
        if (off == -2) return  -1.0;
        if (off == -1) return  16.0;
        if (off ==  0) return -30.0;
        if (off ==  1) return  16.0;
        if (off ==  2) return  -1.0;
        return 0.0;
    case 3:
        if (off == -3) return    2.0;
        if (off == -2) return  -27.0;
        if (off == -1) return  270.0;
        if (off ==  0) return -490.0;
        if (off ==  1) return  270.0;
        if (off ==  2) return  -27.0;
        if (off ==  3) return    2.0;
        return 0.0;
    case 4:
        if (off == -4) return     -9.0;
        if (off == -3) return    128.0;
        if (off == -2) return  -1008.0;
        if (off == -1) return   8064.0;
        if (off ==  0) return -14350.0;
        if (off ==  1) return   8064.0;
        if (off ==  2) return  -1008.0;
        if (off ==  3) return    128.0;
        if (off ==  4) return     -9.0;
        return 0.0;
    default:
        return 0.0;
    }
 }
 __device__ __forceinline__ double fd_first_denom(int r)
 {
    return (r == 4) ? 840.0 : ((r == 3) ? 60.0 : ((r == 2) ? 12.0 : 2.0));
 }
 __device__ __forceinline__ double fd_second_denom(int r)
 {
    return (r == 4) ? 5040.0 : ((r == 3) ? 180.0 : ((r == 2) ? 12.0 : 1.0));
 }
 __device__ __forceinline__ double fd_fetch_axis(const double *src,
                                                int iF, int jF, int kF,
                                                int axis, int off,
                                                int SoA0, int SoA1, int SoA2)
 {
    if (axis == 0) iF += off;
    else if (axis == 1) jF += off;
    else kF += off;
    return fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
 }
 __device__ __forceinline__ double fd_fetch_axis2(const double *src,
                                                 int iF, int jF, int kF,
                                                 int axis_a, int off_a,
                                                 int axis_b, int off_b,
                                                 int SoA0, int SoA1, int SoA2)
 {
    if (axis_a == 0) iF += off_a;
    else if (axis_a == 1) jF += off_a;
    else kF += off_a;
    if (axis_b == 0) iF += off_b;
    else if (axis_b == 1) jF += off_b;
    else kF += off_b;
    return fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
 }
 __device__ __forceinline__ double fd_first_axis_radius(const double *src,
                                                       int iF, int jF, int kF,
                                                       int axis, int r, double h,
                                                       int SoA0, int SoA1, int SoA2)
 {
    if (r <= 0) return 0.0;
    double s = 0.0;
 #pragma unroll
    for (int off = -4; off <= 4; ++off) {
        const double c = fd_first_coef(r, off);
        if (c != 0.0) {
            s += c * fd_fetch_axis(src, iF, jF, kF, axis, off, SoA0, SoA1, SoA2);
        }
    }
    return s / (fd_first_denom(r) * h);
 }
 __device__ __forceinline__ double fd_second_axis_radius(const double *src,
                                                        int iF, int jF, int kF,
                                                        int axis, int r, double h,
                                                        int SoA0, int SoA1, int SoA2)
 {
    if (r <= 0) return 0.0;
    double s = 0.0;
 #pragma unroll
    for (int off = -4; off <= 4; ++off) {
        const double c = fd_second_coef(r, off);
        if (c != 0.0) {
            s += c * fd_fetch_axis(src, iF, jF, kF, axis, off, SoA0, SoA1, SoA2);
        }
    }
    return s / (fd_second_denom(r) * h * h);
 }
 __device__ __forceinline__ double fd_mixed_axis_radius(const double *src,
                                                       int iF, int jF, int kF,
                                                       int axis_a, int r_a, double h_a,
                                                       int axis_b, int r_b, double h_b,
                                                       int SoA0, int SoA1, int SoA2)
 {
    if (r_a <= 0 || r_b <= 0) return 0.0;
    double s = 0.0;
 #pragma unroll
    for (int off_a = -4; off_a <= 4; ++off_a) {
        const double ca = fd_first_coef(r_a, off_a);
        if (ca == 0.0) continue;
 #pragma unroll
        for (int off_b = -4; off_b <= 4; ++off_b) {
            const double cb = fd_first_coef(r_b, off_b);
            if (cb != 0.0) {
                s += ca * cb * fd_fetch_axis2(src, iF, jF, kF, axis_a, off_a,
                                              axis_b, off_b, SoA0, SoA1, SoA2);
            }
        }
    }
    return s / (fd_first_denom(r_a) * fd_first_denom(r_b) * h_a * h_b);
 }
 __device__ __forceinline__ void fd_compute_first3(const double *src,
                                                  int iF, int jF, int kF,
                                                  int iminF, int jminF, int kminF,
                                                  int imaxF, int jmaxF, int kmaxF,
                                                  int SoA0, int SoA1, int SoA2,
                                                  double &fx, double &fy, double &fz)
 {
 #if ghost_width == 3
    const int r = fd_common_radius(iF, jF, kF, iminF, jminF, kminF, imaxF, jmaxF, kmaxF);
    fx = fd_first_axis_radius(src, iF, jF, kF, 0, r, d_gp.dX, SoA0, SoA1, SoA2);
    fy = fd_first_axis_radius(src, iF, jF, kF, 1, r, d_gp.dY, SoA0, SoA1, SoA2);
    fz = fd_first_axis_radius(src, iF, jF, kF, 2, r, d_gp.dZ, SoA0, SoA1, SoA2);
 #else
    fx = fd_first_axis_radius(src, iF, jF, kF, 0, fd_axis_radius(iF, iminF, imaxF),
                              d_gp.dX, SoA0, SoA1, SoA2);
    fy = fd_first_axis_radius(src, iF, jF, kF, 1, fd_axis_radius(jF, jminF, jmaxF),
                              d_gp.dY, SoA0, SoA1, SoA2);
    fz = fd_first_axis_radius(src, iF, jF, kF, 2, fd_axis_radius(kF, kminF, kmaxF),
                              d_gp.dZ, SoA0, SoA1, SoA2);
 #endif
 }
 __device__ __forceinline__ void fd_compute_second6(const double *src,
                                                   int iF, int jF, int kF,
                                                   int iminF, int jminF, int kminF,
                                                   int imaxF, int jmaxF, int kmaxF,
                                                   int SoA0, int SoA1, int SoA2,
                                                   double &fxx, double &fxy, double &fxz,
                                                   double &fyy, double &fyz, double &fzz)
 {
 #if ghost_width == 3
    const int r = fd_common_radius(iF, jF, kF, iminF, jminF, kminF, imaxF, jmaxF, kmaxF);
    const int rx = r, ry = r, rz = r;
 #else
    const int rx = fd_axis_radius(iF, iminF, imaxF);
    const int ry = fd_axis_radius(jF, jminF, jmaxF);
    const int rz = fd_axis_radius(kF, kminF, kmaxF);
 #endif
    fxx = fd_second_axis_radius(src, iF, jF, kF, 0, rx, d_gp.dX, SoA0, SoA1, SoA2);
    fyy = fd_second_axis_radius(src, iF, jF, kF, 1, ry, d_gp.dY, SoA0, SoA1, SoA2);
    fzz = fd_second_axis_radius(src, iF, jF, kF, 2, rz, d_gp.dZ, SoA0, SoA1, SoA2);
    fxy = fd_mixed_axis_radius(src, iF, jF, kF, 0, rx, d_gp.dX, 1, ry, d_gp.dY, SoA0, SoA1, SoA2);
    fxz = fd_mixed_axis_radius(src, iF, jF, kF, 0, rx, d_gp.dX, 2, rz, d_gp.dZ, SoA0, SoA1, SoA2);
    fyz = fd_mixed_axis_radius(src, iF, jF, kF, 1, ry, d_gp.dY, 2, rz, d_gp.dZ, SoA0, SoA1, SoA2);
 }
 __device__ __forceinline__ bool fd_lop_fits(int qF, int qminF, int qmaxF,
                                            int dir, int lo, int hi)
 {
    for (int off = lo; off <= hi; ++off) {
        const int q = qF + dir * off;
        if (q < qminF || q > qmaxF) return false;
    }
    return true;
 }
 __device__ __forceinline__ double fd_lop_fetch_sum(const double *src,
                                                   int iF, int jF, int kF,
                                                   int axis, int dir,
                                                   const double *coef,
                                                   int lo, int hi,
                                                   int SoA0, int SoA1, int SoA2)
 {
    double s = 0.0;
    for (int off = lo; off <= hi; ++off) {
        const double c = coef[off - lo];
        if (c != 0.0) {
            s += c * fd_fetch_axis(src, iF, jF, kF, axis, dir * off, SoA0, SoA1, SoA2);
        }
    }
    return s;
 }
 __device__ __forceinline__ double fd_lopsided_axis(const double *src,
                                                   int iF, int jF, int kF,
                                                   int axis, double speed,
                                                   int qF, int qminF, int qmaxF,
                                                   double h,
                                                   int SoA0, int SoA1, int SoA2)
 {
    if (speed == 0.0) return 0.0;
    const int dir = (speed > 0.0) ? 1 : -1;
    const double mag = (speed > 0.0) ? speed : -speed;
 #if ghost_width == 2
    if (fd_lop_fits(qF, qminF, qmaxF, dir, 0, 2)) {
        const double c[] = {-3.0, 4.0, -1.0};
        return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, 0, 2, SoA0, SoA1, SoA2) / (2.0 * h);
    }
    if (fd_lop_fits(qF, qminF, qmaxF, dir, 0, 1)) {
        const double c[] = {-1.0, 1.0};
        return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, 0, 1, SoA0, SoA1, SoA2) / (2.0 * h);
    }
    return 0.0;
 #elif ghost_width == 3
    if (fd_lop_fits(qF, qminF, qmaxF, dir, -1, 3)) {
        const double c[] = {-3.0, -10.0, 18.0, -6.0, 1.0};
        return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, -1, 3, SoA0, SoA1, SoA2) / (12.0 * h);
    }
    const int r = fd_axis_radius(qF, qminF, qmaxF);
    return speed * fd_first_axis_radius(src, iF, jF, kF, axis, r, h, SoA0, SoA1, SoA2);
 #elif ghost_width == 4
    if (fd_lop_fits(qF, qminF, qmaxF, dir, -2, 4)) {
        const double c[] = {2.0, -24.0, -35.0, 80.0, -30.0, 8.0, -1.0};
        return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, -2, 4, SoA0, SoA1, SoA2) / (60.0 * h);
    }
    if (fd_lop_fits(qF, qminF, qmaxF, dir, -1, 5)) {
        const double c[] = {-10.0, -77.0, 150.0, -100.0, 50.0, -15.0, 2.0};
        return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, -1, 5, SoA0, SoA1, SoA2) / (60.0 * h);
    }
    const int r = fd_axis_radius(qF, qminF, qmaxF);
    return speed * fd_first_axis_radius(src, iF, jF, kF, axis, r, h, SoA0, SoA1, SoA2);
 #else
    if (fd_lop_fits(qF, qminF, qmaxF, dir, -3, 5)) {
        const double c[] = {-5.0, 60.0, -420.0, -378.0, 1050.0, -420.0, 140.0, -30.0, 3.0};
        return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, -3, 5, SoA0, SoA1, SoA2) / (840.0 * h);
    }
    const int r = fd_axis_radius(qF, qminF, qmaxF);
    return speed * fd_first_axis_radius(src, iF, jF, kF, axis, r, h, SoA0, SoA1, SoA2);
 #endif
 }
 __device__ __forceinline__ double fd_ko_coef(int r, int off)
 {
    const int a = off < 0 ? -off : off;
    if (r == 2) {
        if (a == 0) return 6.0;
        if (a == 1) return -4.0;
        if (a == 2) return 1.0;
    } else if (r == 3) {
        if (a == 0) return -20.0;
        if (a == 1) return 15.0;
        if (a == 2) return -6.0;
        if (a == 3) return 1.0;
    } else if (r == 4) {
        if (a == 0) return 70.0;
        if (a == 1) return -56.0;
        if (a == 2) return 28.0;
        if (a == 3) return -8.0;
        if (a == 4) return 1.0;
    } else if (r == 5) {
        if (a == 0) return -252.0;
        if (a == 1) return 210.0;
        if (a == 2) return -120.0;
        if (a == 3) return 45.0;
        if (a == 4) return -10.0;
        if (a == 5) return 1.0;
    }
    return 0.0;
 }
 __device__ __forceinline__ double fd_ko_axis(const double *src,
                                             int iF, int jF, int kF,
                                             int axis, int r,
                                             int SoA0, int SoA1, int SoA2)
 {
    double s = 0.0;
 #pragma unroll
    for (int off = -5; off <= 5; ++off) {
        if (off < -r || off > r) continue;
        const double c = fd_ko_coef(r, off);
        if (c != 0.0) {
            s += c * fd_fetch_axis(src, iF, jF, kF, axis, off, SoA0, SoA1, SoA2);
        }
    }
    return s;
 }
 __device__ __forceinline__ double fd_ko_term(const double *src,
                                             int iF, int jF, int kF,
                                             int iminF, int jminF, int kminF,
                                             int imaxF, int jmaxF, int kmaxF,
                                             double eps_val,
                                             int SoA0, int SoA1, int SoA2)
 {
    const int r = AMSS_FD_LK_RADIUS;
    if (eps_val <= 0.0) return 0.0;
 #if ghost_width >= 4
    if (iF - r <= iminF || iF + r >= imaxF ||
        jF - r <= jminF || jF + r >= jmaxF ||
        kF - r <= kminF || kF + r >= kmaxF) {
        return 0.0;
    }
 #else
    if (iF - r < iminF || iF + r > imaxF ||
        jF - r < jminF || jF + r > jmaxF ||
        kF - r < kminF || kF + r > kmaxF) {
        return 0.0;
    }
 #endif
    double cof = 1.0;
 #pragma unroll
    for (int n = 0; n < 2 * r; ++n) cof *= 2.0;
    const double sign = (r & 1) ? 1.0 : -1.0;
    const double dx = fd_ko_axis(src, iF, jF, kF, 0, r, SoA0, SoA1, SoA2);
    const double dy = fd_ko_axis(src, iF, jF, kF, 1, r, SoA0, SoA1, SoA2);
    const double dz = fd_ko_axis(src, iF, jF, kF, 2, r, SoA0, SoA1, SoA2);
    return sign * eps_val * (dx / d_gp.dX + dy / d_gp.dY + dz / d_gp.dZ) / cof;
 }
 #endif
--- a/AMSS_NCKU_source/gpu_rhsSS_mem.h
+++ b/AMSS_NCKU_source/gpu_rhsSS_mem.h
@@ -1,6 +1,6 @@
 #ifndef GPU_MEM_H_
 #define GPU_MEM_H_
-#include "macrodef.h"
+#include "macrodef.fh"
 #ifdef WithShell
 struct Metass
@@ -48,8 +48,6 @@ struct Meta
 	double * Gamx_rhs,*Gamy_rhs,*Gamz_rhs;//out
 	double * Lap_rhs, *betax_rhs, *betay_rhs, *betaz_rhs;//out
 	double * dtSfx_rhs,*dtSfy_rhs,*dtSfz_rhs;//out
 	double * TZ;                                  //in  (Z4C)
 	double * TZ_rhs;                              //out (Z4C)
 	double * rho,*Sx,*Sy,*Sz			;	//in
 	double * Sxx,*Sxy,*Sxz,*Syy,*Syz,*Szz;	//in
@@ -134,8 +132,6 @@ __constant__ double SYM = 1.0;
 __constant__ double ANTI = -1.0; 
 __constant__ double FF = 0.75;
 __constant__ double eta = 2.0;
 __constant__ double kappa1_c = 0.02;
 __constant__ double kappa2_c = 0.0;
 __constant__ double F1o3;
 __constant__ double F2o3;
 __constant__ double F3o2 = 1.5;
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -13,15 +13,12 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
 ##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
 PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
 ifeq ($(TOOLCHAIN),intel)
 OMP_FLAG = -qopenmp
 ifeq ($(PGO_MODE),instrument)
-## Intel Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
+## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
 CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
+              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
+              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 else
 ## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
 ## PGO has been turned off, now tested and found to be negative optimization
@@ -29,23 +26,9 @@ else
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
+              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
+              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 endif
 TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc $(MKL_INC)
 else
 ## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
 ## PGO_MODE is ignored in this branch.
 OMP_FLAG = -mp
 CXXAPPFLAGS = -O3 -tp=host -Mcache_align -Mfma \
              -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
 f90appflags = -O3 -tp=host -Mcache_align -Mfma -Mpreprocess \
              $(MKL_INC) $(POLINT6_FLAG)
 TP_OPTFLAGS = -O3 -tp=host -Mcache_align -Mfma \
              -Dfortran3 -Dnewc $(MKL_INC)
 endif
 .SUFFIXES: .o .f90 .C .for .cu
@@ -56,10 +39,6 @@ endif
 .C.o:
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 # ShellPatch.C uses OpenMP for setupintintstuff search loops
 ShellPatch.o: ShellPatch.C
 	${CXX} $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
 .for.o:
 	$(f77) -c $< -o $@
@@ -67,15 +46,11 @@ ShellPatch.o: ShellPatch.C
 	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
 # CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
-bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h fd_cuda_helpers.cuh
+bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h
 	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
 # CUDA rewrite of BSSN Shell-Patch RHS (drop-in replacement for bssn_rhs_ss)
 bssn_gpu_rhs_ss.o: bssn_gpu_rhs_ss.cu bssn_gpu.h gpu_rhsSS_mem.h bssn_macro.h macrodef.fh
 	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
 # CUDA rewrite of Z4C Cartesian RHS
-z4c_rhs_cuda.o: z4c_rhs_cuda.cu z4c_rhs_cuda.h bssn_rhs.h macrodef.h ricci_gamma.h fd_cuda_helpers.cuh
+z4c_rhs_cuda.o: z4c_rhs_cuda.cu z4c_rhs_cuda.h bssn_rhs.h macrodef.h ricci_gamma.h
 	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
 # C rewrite of BSSN RHS kernel and helpers
@@ -103,11 +78,17 @@ z4c_rhs_c.o: z4c_rhs_c.C
 #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
 #	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
 TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
 TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=$(TP_PROFDATA) \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 TwoPunctures.o: TwoPunctures.C
-	${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
 TwoPunctureABE.o: TwoPunctureABE.C
-	${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
 # Input files
@@ -131,7 +112,7 @@ else
 CFILES_CPU = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
 endif
-CFILES_CUDA_BSSN = bssn_rhs_cuda.o bssn_gpu_rhs_ss.o
+CFILES_CUDA_BSSN = bssn_rhs_cuda.o
 ifeq ($(USE_CUDA_BSSN),1)
 CFILES = $(CFILES_CUDA_BSSN)
@@ -261,7 +242,7 @@ ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
 #	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
 TwoPunctureABE: $(TwoPunctureFILES)
-	$(CLINKER) $(TP_OPTFLAGS) $(OMP_FLAG) -o $@ $(TwoPunctureFILES) $(LDLIBS)
+	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
 clean:
 	rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,7 +1,28 @@
-## Toolchain selection
+## GCC version (commented out)
-##   nvhpc : NVIDIA HPC SDK + CUDA-aware MPI (default)
+## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
-##   intel : Intel oneAPI toolchain (legacy path)
+## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
-TOOLCHAIN ?= intel
+## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
 ## Intel oneAPI version with oneMKL (Optimized for performance)
 filein  = -I/usr/include/ -I${MKLROOT}/include
 ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
 LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
 ## Memory allocator switch
 ##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
 ##   0           : use system default allocator (ptmalloc)
 USE_TBBMALLOC ?= 1
 TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
 ifneq ($(wildcard $(TBBMALLOC_SO)),)
 TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
 else
 TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
 endif
 ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
 ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
 ##   opt        : (default) maximum performance with PGO profile-guided optimization
@@ -22,14 +43,6 @@ else
 INTERP_LB_FLAGS =
 endif
 MKLROOT ?= /home/intel/oneapi/mkl/latest
 MKL_LIBDIR ?= $(MKLROOT)/lib/intel64
 MKL_INC ?= -I$(MKLROOT)/include
 NVHPC_ROOT ?= /home/nvidia/hpc_sdk/Linux_x86_64/25.11
 CUDA_HOME  ?= $(NVHPC_ROOT)/cuda
 CUDA_ARCH  ?= sm_80
 ## Kernel implementation switch
 ##   1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
 ##   0           : fall back to original Fortran kernels
@@ -45,47 +58,17 @@ USE_CXX_Z4C_KERNELS ?= 1
 ##   0           : use original Fortran rungekutta4_rout.o
 USE_CXX_RK4 ?= 1
 ## Memory allocator switch
 ##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
 ##   0           : use system default allocator (ptmalloc)
 USE_TBBMALLOC ?= 1
 TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
 ifneq ($(wildcard $(TBBMALLOC_SO)),)
 TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
 else
 TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
 endif
 ifeq ($(TOOLCHAIN),intel)
 f90          = ifx
 f77          = ifx
 CXX          = icpx
 CC           = icx
 CLINKER      = mpiicpx
 filein  = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
 LDLIBS       = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
               -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
               -lifcore -limf -liomp5 -lpthread -lm -ldl \
               -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart
 else ifeq ($(TOOLCHAIN),nvhpc)
 f90          = mpifort
 f77          = mpifort
 CXX          = mpicxx
 CC           = mpicc
 CLINKER      = mpicxx
-filein       = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
+Cu = nvcc
-LDLIBS       = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
+CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
-               -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
+#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
-               -lpthread -lm -ldl \
+CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
-               -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart \
+CUDA_ARCH ?= sm_80
-               -fortranlibs
+ifneq ($(strip $(CUDA_ARCH)),)
 CUDA_APP_FLAGS += -arch=$(CUDA_ARCH)
 endif
 ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
 Cu = $(NVHPC_ROOT)/compilers/bin/nvcc
 CUDA_LIB_PATH = -L$(CUDA_HOME)/lib64 -I$(CUDA_HOME)/include
 CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=$(CUDA_ARCH)
--- a/AMSS_NCKU_source/monitor.C
+++ b/AMSS_NCKU_source/monitor.C
@@ -1,7 +1,6 @@
 #ifdef newc
 #include <cstdio>
 #include <sstream>
 using namespace std;
 #else
 #include <stdio.h>
@@ -78,17 +77,16 @@ monitor::monitor(const char fname[], int myrank, string head)
      parameters::str_par.insert(map<string, string>::value_type("output dir", out_dir));
    }
    // considering checkpoint run
-    string filename = out_dir + "/" + fname;
+    char filename[50];
    sprintf(filename, "%s/%s", out_dir.c_str(), fname);
    int i = 1;
-    while ((access(filename.c_str(), F_OK)) != -1)
+    while ((access(filename, F_OK)) != -1)
    {
-      stringstream ss;
+      sprintf(filename, "%s/%d_%s", out_dir.c_str(), i, fname);
      ss << out_dir << "/" << i << "_" << fname;
      filename = ss.str();
      i++;
    }
-    outfile.open(filename.c_str(), ios::trunc);
+    outfile.open(filename, ios::trunc);
    time_t tnow;
    time(&tnow);
@@ -109,17 +107,16 @@ monitor::monitor(const char fname[], int myrank, const int out_rank, string head
  if (I_Print)
  {
    // considering checkpoint run
-    string filename = out_dir + "/" + fname;
+    char filename[50];
    sprintf(filename, "%s/%s", out_dir.c_str(), fname);
    int i = 1;
-    while ((access(filename.c_str(), F_OK)) != -1)
+    while ((access(filename, F_OK)) != -1)
    {
-      stringstream ss;
+      sprintf(filename, "%s/%d_%s", out_dir.c_str(), i, fname);
      ss << out_dir << "/" << i << "_" << fname;
      filename = ss.str();
      i++;
    }
-    outfile.open(filename.c_str(), ios::trunc);
+    outfile.open(filename, ios::trunc);
    time_t tnow;
    time(&tnow);
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -11,7 +11,6 @@
 #include <strstream>
 #include <cmath>
 #include <map>
 #include <cstdlib>
 using namespace std;
 #else
 #include <iostream.h>
@@ -33,20 +32,6 @@ using namespace std;
 #include "parameters.h"
 #define PI M_PI
 namespace
 {
 bool amss_surface_timing_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_SURFACE_TIMING");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 }
 //|============================================================================
 //| Constructor
 //|============================================================================
@@ -3296,8 +3281,6 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
                                         var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
                                         double *Rout, monitor *Monitor, bool refresh_mass_fields)
 {
  const bool timing = amss_surface_timing_enabled();
  const double t_start = timing ? MPI_Wtime() : 0.0;
  if (Symmetry != 0 && Symmetry != 1)
  {
    surf_Wave(rex, lev, GH, Rpsi4, Ipsi4, spinw, maxl, NN, RP, IP, Monitor);
@@ -3342,7 +3325,6 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
      Pp = Pp->next;
    }
  }
  const double t_refresh_done = timing ? MPI_Wtime() : 0.0;
  const int InList = 19;
  const int idx_rpsi4 = 0, idx_ipsi4 = 1;
@@ -3398,7 +3380,6 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
  double *shellf = new double[n_tot * InList];
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
  const double t_interp_done = timing ? MPI_Wtime() : 0.0;
  double *RP_out = new double[NN];
  double *IP_out = new double[NN];
@@ -3515,7 +3496,6 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
    if (Symmetry == 0)
      p_outz += f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * theta_weight;
  }
  const double t_integral_done = timing ? MPI_Wtime() : 0.0;
  for (int ii = 0; ii < NN; ii++)
  {
@@ -3554,7 +3534,6 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
    delete[] reduce_out;
    delete[] reduce_in;
  }
  const double t_reduce_done = timing ? MPI_Wtime() : 0.0;
 #ifdef GaussInt
  mass = mass * rex * rex * dphi * factor;
@@ -3586,19 +3565,6 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
  Rout[5] = sy;
  Rout[6] = sz;
  if (timing)
  {
    fprintf(stderr,
            "[AMSS-SURFACE][rank %d] rex=%.6g lev=%d refresh=%.6f interp=%.6f integral=%.6f reduce=%.6f total=%.6f nlocal=%d ntotal=%d modes=%d\n",
            myrank, rex, lev,
            t_refresh_done - t_start,
            t_interp_done - t_refresh_done,
            t_integral_done - t_interp_done,
            t_reduce_done - t_integral_done,
            t_reduce_done - t_start,
            Nmax - Nmin + 1, n_tot, NN);
  }
  delete[] pox[0];
  delete[] pox[1];
  delete[] pox[2];
--- a/AMSS_NCKU_source/z4c_rhs_cuda.cu
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu
@@ -266,8 +266,6 @@ __device__ __forceinline__ double fetch_sym_ord3_direct(const double *src,
                    + (skF - 1) * d_gp.ex[0] * d_gp.ex[1]];
 }
 #include "fd_cuda_helpers.cuh"
 /* ------------------------------------------------------------------ */
 /*  GPU buffer management                                              */
 /* ------------------------------------------------------------------ */
@@ -294,11 +292,12 @@ struct GpuBuffers {
    size_t  cap_fh3_size;
    int     prev_nx, prev_ny, prev_nz;
    bool    initialized;
    cudaStream_t stream;     /* dedicated transfer stream */
 };
 static GpuBuffers g_buf = {
    nullptr, nullptr, nullptr, nullptr, false, {},
-    0, 0, 0, 0, 0, 0, false
+    0, 0, 0, 0, 0, 0, false, nullptr
 };
 /* Slot assignments — INPUT (H2D) */
@@ -422,7 +421,6 @@ static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = {
 };
 __constant__ int d_subset_state_indices[BSSN_STATE_COUNT];
 __constant__ double d_comm_state_soa[3 * BSSN_STATE_COUNT];
 static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
     1,  1,  1,
@@ -598,6 +596,7 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
                        || (fh3_size > g_buf.cap_fh3_size);
    if (need_grow) {
        if (g_buf.stream) { cudaStreamDestroy(g_buf.stream); g_buf.stream = nullptr; }
        if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
        if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
        if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
@@ -625,6 +624,9 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
            }
        }
        if (!g_buf.stream)
            CUDA_CHECK(cudaStreamCreate(&g_buf.stream));
        g_buf.cap_all = all;
        g_buf.cap_fh2_size = fh2_size;
        g_buf.cap_fh3_size = fh3_size;
@@ -730,21 +732,6 @@ static void upload_grid_params_if_needed(const GridParams &gp)
    }
 }
 static void upload_comm_state_soa(const double *state_soa, int state_count)
 {
    double soa[3 * BSSN_STATE_COUNT];
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
        soa[3 * i + 0] = 1.0;
        soa[3 * i + 1] = 1.0;
        soa[3 * i + 2] = 1.0;
    }
    if (state_soa) {
        const int n = (state_count < BSSN_STATE_COUNT) ? state_count : BSSN_STATE_COUNT;
        std::memcpy(soa, state_soa, (size_t)3 * n * sizeof(double));
    }
    CUDA_CHECK(cudaMemcpyToSymbol(d_comm_state_soa, soa, sizeof(soa)));
 }
 /* ================================================================== */
 /*  A. Symmetry boundary kernels (ord=2 and ord=3)                     */
 /* ================================================================== */
@@ -1437,10 +1424,45 @@ void kern_fderivs_batched(FDerivTables tables, int field_count)
    const int jF = j0 + 1;
    const int kF = k0 + 1;
-    fd_compute_first3(src, iF, jF, kF,
+    if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
-                      iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
+        (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
-                      SoA0, SoA1, SoA2,
+        (kF + 2) <= kmaxF && (kF - 2) >= kminF)
-                      fx[tid], fy[tid], fz[tid]);
+    {
        fx[tid] = d_gp.d12dx * (
            fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
        fy[tid] = d_gp.d12dy * (
            fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
        fz[tid] = d_gp.d12dz * (
            fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
    }
    else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
             (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
             (kF + 1) <= kmaxF && (kF - 1) >= kminF)
    {
        fx[tid] = d_gp.d2dx * (
            -fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
            +fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
        fy[tid] = d_gp.d2dy * (
            -fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
            +fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
        fz[tid] = d_gp.d2dz * (
            -fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
            +fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
    }
    else {
        fx[tid] = 0.0;
        fy[tid] = 0.0;
        fz[tid] = 0.0;
    }
 }
 __global__ __launch_bounds__(128, 4)
@@ -1480,12 +1502,6 @@ void kern_fdderivs_batched(FDDerivTables tables, int field_count)
    const int jF = j0 + 1;
    const int kF = k0 + 1;
 #if ghost_width != 3
    fd_compute_second6(src, iF, jF, kF,
                       iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                       SoA0, SoA1, SoA2,
                       fxx[tid], fxy[tid], fxz[tid], fyy[tid], fyz[tid], fzz[tid]);
 #else
    if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
        (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
        (kF + 2) <= kmaxF && (kF - 2) >= kminF)
@@ -1613,43 +1629,12 @@ void kern_fdderivs_batched(FDDerivTables tables, int field_count)
        fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
        fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
    }
 #endif
 }
 static void gpu_fderivs_batch(int field_count,
                              double *const *src_fields,
                              double *const *fx_fields,
                              double *const *fy_fields,
                              double *const *fz_fields,
                              const int *soa_signs,
                              int all);
 static void gpu_fdderivs_batch(int field_count,
                               double *const *src_fields,
                               double *const *fxx_fields,
                               double *const *fxy_fields,
                               double *const *fxz_fields,
                               double *const *fyy_fields,
                               double *const *fyz_fields,
                               double *const *fzz_fields,
                               const int *soa_signs,
                               int all);
 static void gpu_lopsided_kodis_single_batch(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
                                            double *d_Sfx, double *d_Sfy, double *d_Sfz,
                                            double SoA0, double SoA1, double SoA2,
                                            double eps_val, int all);
 /* symmetry_bd on GPU for ord=2, then launch fderivs kernel */
 static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
                        double SoA0, double SoA1, double SoA2, int all)
 {
 #if ghost_width != 3
    double *src_fields[1] = {d_f};
    double *fx_fields[1] = {d_fx};
    double *fy_fields[1] = {d_fy};
    double *fz_fields[1] = {d_fz};
    const int soa_signs[3] = {(int)SoA0, (int)SoA1, (int)SoA2};
    gpu_fderivs_batch(1, src_fields, fx_fields, fy_fields, fz_fields, soa_signs, all);
 #else
    double *fh = g_buf.d_fh2;
    const size_t nx = (size_t)g_buf.prev_nx;
    const size_t ny = (size_t)g_buf.prev_ny;
@@ -1658,7 +1643,6 @@ static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
    kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
    kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
 #endif
 }
 /* symmetry_bd on GPU for ord=2, then launch fdderivs kernel */
@@ -1667,18 +1651,6 @@ static void gpu_fdderivs(double *d_f,
                         double *d_fyy, double *d_fyz, double *d_fzz,
                         double SoA0, double SoA1, double SoA2, int all)
 {
 #if ghost_width != 3
    double *src_fields[1] = {d_f};
    double *fxx_fields[1] = {d_fxx};
    double *fxy_fields[1] = {d_fxy};
    double *fxz_fields[1] = {d_fxz};
    double *fyy_fields[1] = {d_fyy};
    double *fyz_fields[1] = {d_fyz};
    double *fzz_fields[1] = {d_fzz};
    const int soa_signs[3] = {(int)SoA0, (int)SoA1, (int)SoA2};
    gpu_fdderivs_batch(1, src_fields, fxx_fields, fxy_fields, fxz_fields,
                       fyy_fields, fyz_fields, fzz_fields, soa_signs, all);
 #else
    double *fh = g_buf.d_fh2;
    const size_t nx = (size_t)g_buf.prev_nx;
    const size_t ny = (size_t)g_buf.prev_ny;
@@ -1687,7 +1659,6 @@ static void gpu_fdderivs(double *d_f,
    kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
    kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
 #endif
 }
 static void gpu_fderivs_batch(int field_count,
@@ -1777,12 +1748,6 @@ void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
        const int jF = j0 + 1;
        const int kF = k0 + 1;
 #if ghost_width != 3
        fd_compute_second6(src, iF, jF, kF,
                           iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                           SoA0, SoA1, SoA2,
                           fxx, fxy, fxz, fyy, fyz, fzz);
 #else
        if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
            (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
            (kF + 2) <= kmaxF && (kF - 2) >= kminF)
@@ -1906,7 +1871,6 @@ void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
              - fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
              + fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
        }
 #endif
    }
    dst[tid] = gupxx[tid] * fxx + gupyy[tid] * fyy + gupzz[tid] * fzz
@@ -1971,16 +1935,6 @@ void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
    const int jF = j0 + 1;
    const int kF = k0 + 1;
 #if ghost_width != 3
    fd_compute_second6(Lap, iF, jF, kF,
                       iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                       1, 1, 1,
                       fxx[tid], fxy[tid], fxz[tid], fyy[tid], fyz[tid], fzz[tid]);
    fd_compute_first3(chi, iF, jF, kF,
                      iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                      1, 1, 1,
                      chix_out[tid], chiy_out[tid], chiz_out[tid]);
 #else
    if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
        (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
        (kF + 2) <= kmaxF && (kF - 2) >= kminF)
@@ -2134,7 +2088,6 @@ void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
        fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
        chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
    }
 #endif
 }
 /* Combined ord=3 advection + KO dissipation.
@@ -2146,11 +2099,6 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
                               double SoA0, double SoA1, double SoA2,
                               double eps_val, int all)
 {
 #if ghost_width != 3
    gpu_lopsided_kodis_single_batch(d_f_adv, d_f_ko, d_f_rhs,
                                    d_Sfx, d_Sfy, d_Sfz,
                                    SoA0, SoA1, SoA2, eps_val, all);
 #else
    double *fh = g_buf.d_fh3;
    const size_t nx = (size_t)g_buf.prev_nx;
    const size_t ny = (size_t)g_buf.prev_ny;
@@ -2166,7 +2114,6 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
        }
        kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
    }
 #endif
 }
 __global__ __launch_bounds__(128, 4)
@@ -2199,24 +2146,6 @@ void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
    const int jF = j0 + 1;
    const int kF = k0 + 1;
 #if ghost_width != 3
    if (do_lopsided && i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
        const double val =
            fd_lopsided_axis(adv_src, iF, jF, kF, 0, Sfx[tid], iF, iminF, imaxF,
                             d_gp.dX, SoA0, SoA1, SoA2)
          + fd_lopsided_axis(adv_src, iF, jF, kF, 1, Sfy[tid], jF, jminF, jmaxF,
                             d_gp.dY, SoA0, SoA1, SoA2)
          + fd_lopsided_axis(adv_src, iF, jF, kF, 2, Sfz[tid], kF, kminF, kmaxF,
                             d_gp.dZ, SoA0, SoA1, SoA2);
        rhs[tid] += val;
    }
    if (do_kodis) {
        rhs[tid] += fd_ko_term(ko_src, iF, jF, kF,
                               iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                               eps_val, SoA0, SoA1, SoA2);
    }
 #else
    if (do_lopsided && i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
        double val = 0.0;
@@ -2399,25 +2328,6 @@ void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
        rhs[tid] += (eps_val / cof) * (Dx / d_gp.dX + Dy / d_gp.dY + Dz / d_gp.dZ);
    }
 #endif
 }
 static void gpu_lopsided_kodis_single_batch(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
                                            double *d_Sfx, double *d_Sfy, double *d_Sfz,
                                            double SoA0, double SoA1, double SoA2,
                                            double eps_val, int all)
 {
    LopsidedKodisTables tables = {};
    tables.adv_fields[0] = d_f_adv;
    tables.ko_fields[0] = d_f_ko;
    tables.rhs_fields[0] = d_f_rhs;
    tables.soa_signs[0] = (int)SoA0;
    tables.soa_signs[1] = (int)SoA1;
    tables.soa_signs[2] = (int)SoA2;
    dim3 launch_grid((unsigned int)grid((size_t)all), 1u);
    kern_lopsided_kodis_batched<<<launch_grid, BLK>>>(
        d_Sfx, d_Sfy, d_Sfz, tables, eps_val, 1, eps_val > 0.0 ? 1 : 0);
 }
 static void gpu_lopsided_kodis_state_batch(double eps_val, int all)
@@ -3968,12 +3878,6 @@ void kern_phase12_13_chi_correction_fused(
        const int jF = j0 + 1;
        const int kF = k0 + 1;
 #if ghost_width != 3
        fd_compute_second6(chi, iF, jF, kF,
                           iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                           1, 1, 1,
                           cxx, cxy, cxz, cyy, cyz, czz);
 #else
        if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
            (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
            (kF + 2) <= kmaxF && (kF - 2) >= kminF)
@@ -4097,7 +4001,6 @@ void kern_phase12_13_chi_correction_fused(
              - fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1)
              + fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1));
        }
 #endif
    }
    const double cx = chix[tid];
@@ -4268,12 +4171,6 @@ void kern_phase15_trK_Aij_gauge(
        double fyy_v = 0.0, fyz_v = 0.0, fzz_v = 0.0;
        if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
 #if ghost_width != 3
            fd_compute_second6(alpn1, iF, jF, kF,
                               iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                               1, 1, 1,
                               fxx_v, fxy_v, fxz_v, fyy_v, fyz_v, fzz_v);
 #else
            if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
                (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
                (kF + 2) <= kmaxF && (kF - 2) >= kminF)
@@ -4397,7 +4294,6 @@ void kern_phase15_trK_Aij_gauge(
                  - fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1)
                  + fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1));
            }
 #endif
        }
        /* raised chi/chi */
@@ -4735,15 +4631,15 @@ static void setup_grid_params(int *ex,
    gp.imaxF = nx;
    gp.jmaxF = ny;
    gp.kmaxF = nz;
-    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = 2 - ghost_width;
+    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = -1;
-    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = 2 - ghost_width;
+    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = -1;
-    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = 2 - ghost_width;
+    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = -1;
    gp.iminF3 = 1;
    gp.jminF3 = 1;
    gp.kminF3 = 1;
-    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = 1 - ghost_width;
+    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = -2;
-    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = 1 - ghost_width;
+    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = -2;
-    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = 1 - ghost_width;
+    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = -2;
    gp.Symmetry = Symmetry;
    gp.eps = eps;
    gp.co = co;
@@ -4788,9 +4684,9 @@ static void upload_state_inputs(double **state_host, size_t all)
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
        std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
    }
-    CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
+    CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_chi], g_buf.h_stage,
-                          (size_t)BSSN_STATE_COUNT * bytes,
+                               (size_t)BSSN_STATE_COUNT * bytes,
-                          cudaMemcpyHostToDevice));
+                               cudaMemcpyHostToDevice, g_buf.stream));
 }
 static void upload_matter_cache(StepContext &ctx,
@@ -4801,9 +4697,9 @@ static void upload_matter_cache(StepContext &ctx,
    for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
        std::memcpy(g_buf.h_stage + (size_t)i * all, matter_host[i], bytes);
    }
-    CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage,
+    CUDA_CHECK(cudaMemcpyAsync(ctx.d_matter_mem, g_buf.h_stage,
-                          (size_t)BSSN_MATTER_COUNT * bytes,
+                               (size_t)BSSN_MATTER_COUNT * bytes,
-                          cudaMemcpyHostToDevice));
+                               cudaMemcpyHostToDevice, g_buf.stream));
    ctx.matter_ready = true;
 }
@@ -5131,9 +5027,11 @@ static void launch_rhs_pipeline(int all, double eps, int co)
 static void download_state_outputs(double **state_host_out, size_t all)
 {
    const size_t bytes = all * sizeof(double);
-    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs],
+    CUDA_CHECK(cudaStreamSynchronize(0));
-                          (size_t)BSSN_STATE_COUNT * bytes,
+    CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_chi_rhs],
-                          cudaMemcpyDeviceToHost));
+                               (size_t)BSSN_STATE_COUNT * bytes,
                               cudaMemcpyDeviceToHost, g_buf.stream));
    CUDA_CHECK(cudaStreamSynchronize(g_buf.stream));
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
        std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
    }
@@ -5142,9 +5040,11 @@ static void download_state_outputs(double **state_host_out, size_t all)
 static void download_constraint_outputs(double **constraint_host_out, size_t all)
 {
    const size_t bytes = all * sizeof(double);
-    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_ham_Res],
+    CUDA_CHECK(cudaStreamSynchronize(0));
-                          (size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes,
+    CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_ham_Res],
-                          cudaMemcpyDeviceToHost));
+                               (size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes,
                               cudaMemcpyDeviceToHost, g_buf.stream));
    CUDA_CHECK(cudaStreamSynchronize(g_buf.stream));
    for (int i = 0; i < D2H_CONSTRAINT_SLOT_COUNT; ++i) {
        std::memcpy(constraint_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
    }
@@ -5198,196 +5098,6 @@ __global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem,
    }
 }
 __device__ __forceinline__ double load_comm_state_cell_sym(const double * __restrict__ src_mem,
                                                           int state_index,
                                                           int x, int y, int z,
                                                           int nx, int ny,
                                                           int all)
 {
    double s = 1.0;
    if (x < 0) {
        x = -x;
        s *= d_comm_state_soa[3 * state_index + 0];
    }
    if (y < 0) {
        y = -y;
        s *= d_comm_state_soa[3 * state_index + 1];
    }
    if (z < 0) {
        z = -z;
        s *= d_comm_state_soa[3 * state_index + 2];
    }
    const int src = x + y * nx + z * nx * ny;
    return s * src_mem[(size_t)state_index * all + src];
 }
 __global__ void kern_restrict_state_region_batch(const double * __restrict__ src_mem,
                                                 double * __restrict__ dst,
                                                 int nx, int ny,
                                                 int sx, int sy, int sz,
                                                 int fi0, int fj0, int fk0,
                                                 int region_all,
                                                 int state_count,
                                                 int all)
 {
    const int state_index = blockIdx.y;
    if (state_index >= state_count) return;
 #if ghost_width == 5
    const double c1 = 35.0 / 65536.0;
    const double c2 = -405.0 / 65536.0;
    const double c3 = 567.0 / 16384.0;
    const double c4 = -2205.0 / 16384.0;
    const double c5 = 19845.0 / 32768.0;
    const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
    const double w[10] = {c1, c2, c3, c4, c5, c5, c4, c3, c2, c1};
    const int nst = 10;
 #elif ghost_width == 4
    const double c1 = -5.0 / 2048.0;
    const double c2 = 49.0 / 2048.0;
    const double c3 = -245.0 / 2048.0;
    const double c4 = 1225.0 / 2048.0;
    const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4};
    const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1};
    const int nst = 8;
 #elif ghost_width == 3
    const double c1 = 3.0 / 256.0;
    const double c2 = -25.0 / 256.0;
    const double c3 = 75.0 / 128.0;
    const int offs[6] = {-2, -1, 0, 1, 2, 3};
    const double w[6] = {c1, c2, c3, c3, c2, c1};
    const int nst = 6;
 #else
    const double c1 = -1.0 / 16.0;
    const double c2 = 9.0 / 16.0;
    const int offs[4] = {-1, 0, 1, 2};
    const double w[4] = {c1, c2, c2, c1};
    const int nst = 4;
 #endif
    for (int local = blockIdx.x * blockDim.x + threadIdx.x;
         local < region_all;
         local += blockDim.x * gridDim.x)
    {
        const int ii = local % sx;
        const int jj = (local / sx) % sy;
        const int kk = local / (sx * sy);
        const int fc_i = fi0 + 2 * ii;
        const int fc_j = fj0 + 2 * jj;
        const int fc_k = fk0 + 2 * kk;
        double sum = 0.0;
        for (int oz = 0; oz < nst; ++oz) {
            const int z = fc_k + offs[oz];
            const double wz = w[oz];
            for (int oy = 0; oy < nst; ++oy) {
                const int y = fc_j + offs[oy];
                const double wyz = wz * w[oy];
                for (int ox = 0; ox < nst; ++ox) {
                    const int x = fc_i + offs[ox];
                    sum += wyz * w[ox] *
                           load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all);
                }
            }
        }
        dst[(size_t)state_index * region_all + local] = sum;
    }
 }
 __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_mem,
                                                double * __restrict__ dst,
                                                int nx, int ny,
                                                int sx, int sy, int sz,
                                                int ii0, int jj0, int kk0,
                                                int lbc_i, int lbc_j, int lbc_k,
                                                int region_all,
                                                int state_count,
                                                int all)
 {
    const int state_index = blockIdx.y;
    if (state_index >= state_count) return;
 #if ghost_width == 5
    const double c1 = 13585.0 / 33554432.0;
    const double c2 = -159885.0 / 33554432.0;
    const double c3 = 230945.0 / 8388608.0;
    const double c4 = -969969.0 / 8388608.0;
    const double c5 = 14549535.0 / 16777216.0;
    const double c6 = 4849845.0 / 16777216.0;
    const double c7 = -692835.0 / 8388608.0;
    const double c8 = 188955.0 / 8388608.0;
    const double c9 = -138567.0 / 33554432.0;
    const double c10 = 12155.0 / 33554432.0;
    const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
    const double wl[10] = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10};
    const double wr[10] = {c10, c9, c8, c7, c6, c5, c4, c3, c2, c1};
    const int nst = 10;
 #elif ghost_width == 4
    const double c1 = -495.0 / 262144.0;
    const double c2 = 5005.0 / 262144.0;
    const double c3 = -27027.0 / 262144.0;
    const double c4 = 225225.0 / 262144.0;
    const double c5 = 75075.0 / 262144.0;
    const double c6 = -19305.0 / 262144.0;
    const double c7 = 4095.0 / 262144.0;
    const double c8 = -429.0 / 262144.0;
    const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4};
    const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8};
    const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1};
    const int nst = 8;
 #elif ghost_width == 3
    const double c1 = 77.0 / 8192.0;
    const double c2 = -693.0 / 8192.0;
    const double c3 = 3465.0 / 4096.0;
    const double c4 = 1155.0 / 4096.0;
    const double c5 = -495.0 / 8192.0;
    const double c6 = 63.0 / 8192.0;
    const int offs[6] = {-2, -1, 0, 1, 2, 3};
    const double wl[6] = {c1, c2, c3, c4, c5, c6};
    const double wr[6] = {c6, c5, c4, c3, c2, c1};
    const int nst = 6;
 #else
    const double c1 = -7.0 / 128.0;
    const double c2 = 105.0 / 128.0;
    const double c3 = 35.0 / 128.0;
    const double c4 = -5.0 / 128.0;
    const int offs[4] = {-1, 0, 1, 2};
    const double wl[4] = {c1, c2, c3, c4};
    const double wr[4] = {c4, c3, c2, c1};
    const int nst = 4;
 #endif
    for (int local = blockIdx.x * blockDim.x + threadIdx.x;
         local < region_all;
         local += blockDim.x * gridDim.x)
    {
        const int ii = local % sx;
        const int jj = (local / sx) % sy;
        const int kk = local / (sx * sy);
        const int fine_i = ii0 + ii;
        const int fine_j = jj0 + jj;
        const int fine_k = kk0 + kk;
        const int ci = fine_i / 2 - lbc_i;
        const int cj = fine_j / 2 - lbc_j;
        const int ck = fine_k / 2 - lbc_k;
        const double *wx = ((fine_i / 2) * 2 == fine_i) ? wl : wr;
        const double *wy = ((fine_j / 2) * 2 == fine_j) ? wl : wr;
        const double *wz = ((fine_k / 2) * 2 == fine_k) ? wl : wr;
        double sum = 0.0;
        for (int oz = 0; oz < nst; ++oz) {
            const int z = ck + offs[oz];
            const double wzv = wz[oz];
            for (int oy = 0; oy < nst; ++oy) {
                const int y = cj + offs[oy];
                const double wyz = wzv * wy[oy];
                for (int ox = 0; ox < nst; ++ox) {
                    const int x = ci + offs[ox];
                    sum += wyz * wx[ox] *
                           load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all);
                }
            }
        }
        dst[(size_t)state_index * region_all + local] = sum;
    }
 }
 __global__ void kern_pack_state_subset(const double * __restrict__ src_mem,
                                       double * __restrict__ dst,
                                       int subset_count,
@@ -5523,36 +5233,6 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
    }
 }
 static void copy_state_region_packed_batch_device_cuda(void *block_tag,
                                                       int state_count,
                                                       double *device_buffer,
                                                       const int *ex,
                                                       int i0, int j0, int k0,
                                                       int sx, int sy, int sz,
                                                       int pack_not_unpack)
 {
    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
    if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return;
    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
    const int region_all = sx * sy * sz;
    dim3 launch_grid((unsigned int)grid((size_t)region_all),
                     (unsigned int)state_count);
    if (pack_not_unpack) {
        kern_pack_state_region_batch<<<launch_grid, BLK>>>(
            ctx.d_state_curr_mem, device_buffer, ex[0], ex[1],
            i0, j0, k0, sx, sy, sz, region_all, state_count,
            ex[0] * ex[1] * ex[2]);
    } else {
        kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
            ctx.d_state_curr_mem, device_buffer, ex[0], ex[1],
            i0, j0, k0, sx, sy, sz, region_all, state_count,
            ex[0] * ex[1] * ex[2]);
        ctx.state_ready = true;
    }
 }
 static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
 {
    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
@@ -7635,9 +7315,9 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag,
                                                 g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
    }
    if (RK4 == 0) {
-        CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
+        CUDA_CHECK(cudaMemcpyAsync(ctx.d_state0_mem, g_buf.slot[S_chi],
-                              (size_t)BSSN_STATE_COUNT * bytes,
+                                   (size_t)BSSN_STATE_COUNT * bytes,
-                              cudaMemcpyDeviceToDevice));
+                                   cudaMemcpyDeviceToDevice, g_buf.stream));
    }
    if (profile) {
        cuda_profile_sync();
@@ -7780,90 +7460,6 @@ extern "C" int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
    return 0;
 }
 extern "C" int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                                          int state_count,
                                                          double *device_buffer,
                                                          int *ex,
                                                          int i0, int j0, int k0,
                                                          int sx, int sy, int sz)
 {
    using namespace z4c_cuda;
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
                                               i0, j0, k0, sx, sy, sz, 1);
    return 0;
 }
 extern "C" int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                              int state_count,
                                                              double *device_buffer,
                                                              int *ex,
                                                              int i0, int j0, int k0,
                                                              int sx, int sy, int sz)
 {
    using namespace z4c_cuda;
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
                                               i0, j0, k0, sx, sy, sz, 0);
    return 0;
 }
 extern "C" int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                              int state_count,
                                                              double *device_buffer,
                                                              int *ex,
                                                              int sx, int sy, int sz,
                                                              int fi0, int fj0, int fk0,
                                                              const double *state_soa)
 {
    using namespace z4c_cuda;
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
    if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
    const int region_all = sx * sy * sz;
    upload_comm_state_soa(state_soa, state_count);
    dim3 launch_grid((unsigned int)grid((size_t)region_all),
                     (unsigned int)state_count);
    kern_restrict_state_region_batch<<<launch_grid, BLK>>>(
        ctx.d_state_curr_mem, device_buffer,
        ex[0], ex[1], sx, sy, sz,
        fi0, fj0, fk0, region_all, state_count,
        ex[0] * ex[1] * ex[2]);
    return 0;
 }
 extern "C" int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                             int state_count,
                                                             double *device_buffer,
                                                             int *ex,
                                                             int sx, int sy, int sz,
                                                             int ii0, int jj0, int kk0,
                                                             int lbc_i, int lbc_j, int lbc_k,
                                                             const double *state_soa)
 {
    using namespace z4c_cuda;
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
    if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
    const int region_all = sx * sy * sz;
    upload_comm_state_soa(state_soa, state_count);
    dim3 launch_grid((unsigned int)grid((size_t)region_all),
                     (unsigned int)state_count);
    kern_prolong_state_region_batch<<<launch_grid, BLK>>>(
        ctx.d_state_curr_mem, device_buffer,
        ex[0], ex[1], sx, sy, sz,
        ii0, jj0, kk0, lbc_i, lbc_j, lbc_k,
        region_all, state_count,
        ex[0] * ex[1] * ex[2]);
    return 0;
 }
 extern "C" int z4c_cuda_download_state_subset(void *block_tag,
                                              int *ex,
                                              int subset_count,
--- a/AMSS_NCKU_source/z4c_rhs_cuda.h
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.h
@@ -60,37 +60,6 @@ int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                 int i0, int j0, int k0,
                                                 int sx, int sy, int sz);
 int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                               int state_count,
                                               double *device_buffer,
                                               int *ex,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz);
 int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
                                                   int *ex,
                                                   int i0, int j0, int k0,
                                                   int sx, int sy, int sz);
 int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
                                                   int *ex,
                                                   int sx, int sy, int sz,
                                                   int fi0, int fj0, int fk0,
                                                   const double *state_soa);
 int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                  int state_count,
                                                  double *device_buffer,
                                                  int *ex,
                                                  int sx, int sy, int sz,
                                                  int ii0, int jj0, int kk0,
                                                  int lbc_i, int lbc_j, int lbc_k,
                                                  const double *state_soa);
 int z4c_cuda_download_state_subset(void *block_tag,
                                   int *ex,
                                   int subset_count,
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -9,8 +9,6 @@
 import AMSS_NCKU_Input as input_data
 import os
 import shutil
 import subprocess
 import time
@@ -58,157 +56,6 @@ BUILD_JOBS = 64
 ##################################################################
 def _truthy(value, default=False):
    if value is None:
        return default
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text == "":
        return default
    return text in ("1", "yes", "y", "true", "on", "enable", "enabled")
 def _input_or_env(input_name, env_name, default=None):
    if env_name in os.environ:
        return os.environ[env_name]
    return getattr(input_data, input_name, default)
 def _start_cuda_mps_if_requested(runtime_env):
    if input_data.GPU_Calculation != "yes":
        return False
    default_auto_mps = int(getattr(input_data, "MPI_processes", 1)) > 1
    auto_mps = _truthy(
        _input_or_env("CUDA_Auto_MPS", "AMSS_CUDA_AUTO_MPS", default_auto_mps),
        default=default_auto_mps,
    )
    if not auto_mps:
        return False
    mps_control = shutil.which("nvidia-cuda-mps-control")
    if not mps_control:
        print(" CUDA MPS control command was not found; running without MPS.")
        return False
    uid = os.getuid()
    pipe_dir = str(_input_or_env("CUDA_MPS_PIPE_DIRECTORY", "CUDA_MPS_PIPE_DIRECTORY",
                                 f"/tmp/amss-ncku-mps-{uid}"))
    log_dir = str(_input_or_env("CUDA_MPS_LOG_DIRECTORY", "CUDA_MPS_LOG_DIRECTORY",
                                f"/tmp/amss-ncku-mps-log-{uid}"))
    os.makedirs(pipe_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    mps_env = runtime_env.copy()
    mps_env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
    mps_env["CUDA_MPS_LOG_DIRECTORY"] = log_dir
    if os.path.exists(os.path.join(pipe_dir, "control")):
        runtime_env.update({
            "CUDA_MPS_PIPE_DIRECTORY": pipe_dir,
            "CUDA_MPS_LOG_DIRECTORY": log_dir,
        })
        print(f" Reusing CUDA MPS daemon: {pipe_dir}")
        return False
    print(f" Starting CUDA MPS daemon for this run: {pipe_dir}")
    result = subprocess.run([mps_control, "-d"], env=mps_env, text=True,
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    if result.returncode != 0:
        print(" CUDA MPS daemon did not start; running without MPS.")
        if result.stdout:
            print(result.stdout, end="")
        return False
    runtime_env.update({
        "CUDA_MPS_PIPE_DIRECTORY": pipe_dir,
        "CUDA_MPS_LOG_DIRECTORY": log_dir,
    })
    return True
 def _stop_cuda_mps(runtime_env):
    mps_control = shutil.which("nvidia-cuda-mps-control")
    if not mps_control:
        return
    subprocess.run([mps_control], input="quit\n", env=runtime_env, text=True,
                   stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
 def _gpu_runtime_env():
    runtime_env = os.environ.copy()
    finite_difference = str(getattr(input_data, "Finite_Diffenence_Method", "4th-order")).strip()
    defaults = {
        "AMSS_EVOLVE_TIMING": "1",
        "AMSS_ESCALAR_STEP_TIMING": "0",
        "AMSS_INTERP_FAST": "1",
        "AMSS_INTERP_GPU": "1",
        "AMSS_ANALYSIS_MAP_EVERY": "1000000",
        "AMSS_CUDA_AWARE_MPI": "1",
        "AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP": "1",
        "AMSS_CUDA_KEEP_ALL_LEVELS": "1",
        "AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP": "1",
        "AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS": "1",
        "AMSS_CUDA_EM_CACHE_SOURCES": "1",
        "AMSS_CUDA_EM_ZERO_FASTPATH": "1",
        "AMSS_EM_ZERO_ANALYSIS_FASTPATH": "1",
        "AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH": "1",
        "AMSS_CUDA_AMR_HOST_STAGED": "1",
        "AMSS_CUDA_AMR_RESTRICT_DEVICE": "0",
        "AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
        "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
        "AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1",
        "AMSS_SHELL_FAST_INTERP": "0",
        "AMSS_SHELL_PARALLEL_INTERP": "0",
        "AMSS_SHELL_CUDA_INTERP": "0",
    }
    if finite_difference in ("2nd-order", "8th-order"):
        defaults.update({
            "AMSS_INTERP_FAST": "0",
            "AMSS_INTERP_GPU": "0",
            "AMSS_CUDA_AWARE_MPI": "0",
        })
    if finite_difference == "8th-order" and getattr(input_data, "Equation_Class", "") == "BSSN-EM":
        defaults.update({
            "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
            "AMSS_CUDA_AMR_RESTRICT_BATCH": "1",
            "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "1",
        })
    if getattr(input_data, "basic_grid_set", "") == "Shell-Patch":
        defaults.update({
            "AMSS_CUDA_AWARE_MPI": "0",
            "AMSS_SHELL_FAST_INTERP": "1",
            "AMSS_SHELL_PARALLEL_INTERP": "1",
            "AMSS_SHELL_INTERP_THREADS": "16",
        })
    if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"):
        defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1"
    if getattr(input_data, "Equation_Class", "") == "Z4C":
        defaults.update({
            "AMSS_Z4C_CUDA_RESIDENT": "1",
            "AMSS_CONSTRAINT_OUT_EVERY": "1000000",
        })
    for key, value in defaults.items():
        runtime_env.setdefault(key, value)
    optional_overrides = {
        "AMSS_INTERP_FAST_COMPARE": "AMSS_Interp_Fast_Compare",
        "AMSS_INTERP_FAST_COMPARE_LIMIT": "AMSS_Interp_Fast_Compare_Limit",
        "AMSS_INTERP_FAST_COMPARE_TOL": "AMSS_Interp_Fast_Compare_Tol",
        "AMSS_GPU_STAGE_TIMING": "AMSS_GPU_Stage_Timing",
        "AMSS_GPU_STAGE_TIMING_EVERY": "AMSS_GPU_Stage_Timing_Every",
    }
    for env_name, input_name in optional_overrides.items():
        if env_name not in runtime_env and hasattr(input_data, input_name):
            runtime_env[env_name] = str(getattr(input_data, input_name))
    return runtime_env
 ##################################################################
 ##################################################################
@@ -298,84 +145,29 @@ def run_ABE():
    print(                                                      )
    ## Define the command to run; cast other values to strings as needed
    mpi_env = None
    started_mps = False
    mpi_processes = int(input_data.MPI_processes)
    if (input_data.GPU_Calculation == "yes" and
        getattr(input_data, "Equation_Class", "") == "Z4C"):
        z4c_env_np = os.environ.get("AMSS_Z4C_GPU_MPI_PROCESSES")
        if z4c_env_np and int(z4c_env_np) > 0:
            mpi_processes = int(z4c_env_np)
        elif mpi_processes < 4:
            mpi_processes = 4
    if (input_data.GPU_Calculation == "yes" and
        getattr(input_data, "basic_grid_set", "") == "Shell-Patch"):
        shell_env_np = os.environ.get("AMSS_SHELL_GPU_MPI_PROCESSES")
        if shell_env_np and int(shell_env_np) > 0:
            mpi_processes = int(shell_env_np)
        elif mpi_processes < 4:
            mpi_processes = 4
    if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(mpi_processes) + " ./ABE"
+        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " I_MPI_OFFLOAD=1 I_MPI_OFFLOAD_IPC=0 mpirun -np " + str(mpi_processes) + " ./ABE_CUDA"
+        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE_CUDA"
        mpi_command_outfile = "ABEGPU_out.log"
        mpi_env = _gpu_runtime_env()
        started_mps = _start_cuda_mps_if_requested(mpi_env)
        print(" GPU optimized runtime switches:")
        print(f"   MPI processes={mpi_processes}")
        print(f"   AMSS_INTERP_FAST={mpi_env.get('AMSS_INTERP_FAST', '')}")
        print(f"   AMSS_INTERP_GPU={mpi_env.get('AMSS_INTERP_GPU', '')}")
        print(f"   AMSS_ANALYSIS_MAP_EVERY={mpi_env.get('AMSS_ANALYSIS_MAP_EVERY', '')}")
        print(f"   AMSS_EVOLVE_TIMING={mpi_env.get('AMSS_EVOLVE_TIMING', '')}")
        print(f"   AMSS_ESCALAR_STEP_TIMING={mpi_env.get('AMSS_ESCALAR_STEP_TIMING', '')}")
        print(f"   AMSS_CUDA_AWARE_MPI={mpi_env.get('AMSS_CUDA_AWARE_MPI', '')}")
        print(f"   AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP', '')}")
        print(f"   AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}")
        print(f"   AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP', '')}")
        print(f"   AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS', '')}")
        print(f"   AMSS_CUDA_EM_CACHE_SOURCES={mpi_env.get('AMSS_CUDA_EM_CACHE_SOURCES', '')}")
        print(f"   AMSS_CUDA_EM_ZERO_FASTPATH={mpi_env.get('AMSS_CUDA_EM_ZERO_FASTPATH', '')}")
        print(f"   AMSS_EM_ZERO_ANALYSIS_FASTPATH={mpi_env.get('AMSS_EM_ZERO_ANALYSIS_FASTPATH', '')}")
        print(f"   AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH={mpi_env.get('AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH', '')}")
        print(f"   AMSS_CUDA_AMR_HOST_STAGED={mpi_env.get('AMSS_CUDA_AMR_HOST_STAGED', '')}")
        print(f"   AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
        print(f"   AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
        print(f"   AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")
        print(f"   AMSS_CUDA_UNCACHED_DEVICE_BUFFERS={mpi_env.get('AMSS_CUDA_UNCACHED_DEVICE_BUFFERS', '')}")
        print(f"   AMSS_SHELL_FAST_INTERP={mpi_env.get('AMSS_SHELL_FAST_INTERP', '')}")
        print(f"   AMSS_SHELL_PARALLEL_INTERP={mpi_env.get('AMSS_SHELL_PARALLEL_INTERP', '')}")
        print(f"   AMSS_SHELL_CUDA_INTERP={mpi_env.get('AMSS_SHELL_CUDA_INTERP', '')}")
        print(f"   AMSS_SHELL_INTERP_THREADS={mpi_env.get('AMSS_SHELL_INTERP_THREADS', '')}")
        print(f"   AMSS_Z4C_CUDA_RESIDENT={mpi_env.get('AMSS_Z4C_CUDA_RESIDENT', '')}")
        print(f"   AMSS_CONSTRAINT_OUT_EVERY={mpi_env.get('AMSS_CONSTRAINT_OUT_EVERY', '')}")
        if "CUDA_MPS_PIPE_DIRECTORY" in mpi_env:
            print(f"   CUDA_MPS_PIPE_DIRECTORY={mpi_env['CUDA_MPS_PIPE_DIRECTORY']}")
-    try:
+    ## Execute the MPI command and stream output
-        ## Execute the MPI command and stream output
+    mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
        mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT, text=True, env=mpi_env)
-        ## Write ABE run output to file while printing to stdout
+    ## Write ABE run output to file while printing to stdout
-        with open(mpi_command_outfile, 'w') as file0:
+    with open(mpi_command_outfile, 'w') as file0:  
-            ## Read and print output lines; also write each line to file
+        ## Read and print output lines; also write each line to file
-            for line in mpi_process.stdout:
+        for line in mpi_process.stdout:
-                print(line, end='')  # stream output in real time
+            print(line, end='')  # stream output in real time
-                file0.write(line)    # write the line to file
+            file0.write(line)    # write the line to file
-                file0.flush()        # flush to ensure each line is written immediately (optional)
+            file0.flush()        # flush to ensure each line is written immediately (optional)            
    file0.close()
-        ## Wait for the process to finish
+    ## Wait for the process to finish
-        mpi_return_code = mpi_process.wait()
+    mpi_return_code = mpi_process.wait()
        if mpi_return_code != 0:
            raise subprocess.CalledProcessError(mpi_return_code, mpi_command)
    finally:
        if started_mps:
            _stop_cuda_mps(mpi_env)
    print(                                           )
    print( " The ABE/ABEGPU simulation is finished " ) 
--- a/plot_xiaoqu.py
+++ b/plot_xiaoqu.py
@@ -822,21 +822,8 @@ def generate_constraint_check_plot( outdir, figure_outdir, input_level_number ):
    print( " Begin the constraint violation plot for grid level number =  ", input_level_number )
    if (not os.path.exists(file0)) or os.path.getsize(file0) == 0:
        if ( input_level_number == 0 ):
            print( " Constraint data file is empty; skip constraint violation plots" )
            print(                                                            )
        return
    # load the full data file (assumed whitespace-separated floats)
    data = numpy.loadtxt(file0)
    data = numpy.atleast_2d(data)
    if data.shape[1] < 8:
        if ( input_level_number == 0 ):
            print( " Constraint data file has insufficient columns; skip constraint violation plots" )
            print(                                                                                )
        return
    # extract columns from the constraint data file
    time          = data[:,0]