diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C
index 79ed62b..1775761 100644
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -1,10 +1,10 @@
 
 #ifdef newc
-#include <sstream>
-#include <cstdio>
-#include <map>
-#include <string>
-using namespace std;
+#include <sstream>
+#include <cstdio>
+#include <map>
+#include <string>
+using namespace std;
 #else
 #include <stdio.h>
 #include <map.h>
@@ -27,20 +27,23 @@ using namespace std;
 #include "shellfunctions.h"
 #include "cpbc.h"
 #include "kodiss.h"
-#include "parameters.h"
-
-#ifndef USE_CUDA_Z4C
-#define USE_CUDA_Z4C 0
-#endif
-
-#if USE_CUDA_Z4C && (ABEtype == 2)
-#include "z4c_rhs_cuda.h"
-#endif
-
-#ifdef With_AHF
-#include "derivatives.h"
-#include "myglobal.h"
-#endif
+#include "parameters.h"
+
+#ifndef USE_CUDA_Z4C
+#define USE_CUDA_Z4C 0
+#endif
+
+#if USE_CUDA_Z4C && (ABEtype == 2)
+#include "z4c_rhs_cuda.h"
+#endif
+#if USE_CUDA_BSSN
+#include "bssn_rhs_cuda.h"
+#endif
+
+#ifdef With_AHF
+#include "derivatives.h"
+#include "myglobal.h"
+#endif
 
 //================================================================================================
 
@@ -136,19 +139,19 @@ void Z4c_class::Initialize()
   {
     CheckPoint->read_Black_Hole_position(BH_num_input, BH_num, Porg0, Pmom, Spin, Mass, Porgbr, Porg, Porg1, Porg_rhs);
   }
-  else
-  {
-    PhysTime = StartTime;
-    Setup_Black_Hole_position();
-  }
-
-  sync_cache_pre = new Parallel::SyncCache[GH->levels];
-  sync_cache_cor = new Parallel::SyncCache[GH->levels];
-  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
-  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
-  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
-  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
-}
+  else
+  {
+    PhysTime = StartTime;
+    Setup_Black_Hole_position();
+  }
+
+  sync_cache_pre = new Parallel::SyncCache[GH->levels];
+  sync_cache_cor = new Parallel::SyncCache[GH->levels];
+  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
+  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
+  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
+  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
+}
 
 //================================================================================================
 
@@ -183,572 +186,577 @@ Z4c_class::~Z4c_class()
 
 #define MRBD 0 // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them
 
-#ifndef CPBC
-// for sommerfeld boundary
-
-#if USE_CUDA_Z4C && (ABEtype == 2)
-#ifdef WithShell
-#error "USE_CUDA_Z4C resident path currently supports Cartesian non-shell Z4C only"
-#endif
-#if (MRBD == 2)
-#error "USE_CUDA_Z4C resident path does not support MRBD == 2"
-#endif
-
-namespace {
-
-static const int k_z4c_cuda_bh_state_indices[3] = {18, 19, 20};
-
-bool fill_z4c_cuda_views(Block *cg, MyList<var> *vars,
-                         double **host_views,
-                         double *propspeeds = 0,
-                         double *soa_flat = 0)
-{
-  int idx = 0;
-  while (vars && idx < Z4C_CUDA_STATE_COUNT)
-  {
-    host_views[idx] = cg->fgfs[vars->data->sgfn];
-    if (propspeeds)
-      propspeeds[idx] = vars->data->propspeed;
-    if (soa_flat)
-    {
-      soa_flat[3 * idx + 0] = vars->data->SoA[0];
-      soa_flat[3 * idx + 1] = vars->data->SoA[1];
-      soa_flat[3 * idx + 2] = vars->data->SoA[2];
-    }
-    vars = vars->next;
-    ++idx;
-  }
-  return idx == Z4C_CUDA_STATE_COUNT && vars == 0;
-}
-
-void z4c_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
-{
-  MyList<Patch> *Pp = PatL;
-  while (Pp)
-  {
-    MyList<Block> *BP = Pp->data->blb;
-    while (BP)
-    {
-      Block *cg = BP->data;
-      if (myrank == cg->rank && z4c_cuda_has_resident_state(cg))
-      {
-        double *state_out[Z4C_CUDA_STATE_COUNT];
-        if (!fill_z4c_cuda_views(cg, vars, state_out))
-        {
-          cout << "CUDA Z4C state list mismatch on resident state download" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        if (z4c_cuda_download_resident_state(cg, cg->shape, state_out))
-        {
-          cout << "CUDA Z4C resident state download failed" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        if (release_ctx)
-          z4c_cuda_release_step_ctx(cg);
-      }
-      if (BP == Pp->data->ble)
-        break;
-      BP = BP->next;
-    }
-    Pp = Pp->next;
-  }
-}
-
-bool z4c_cuda_patch_contains_point(Patch *patch, const double *point)
-{
-  if (!patch)
-    return false;
-  for (int d = 0; d < dim; d++)
-  {
-    const double h = patch->getdX(d);
-    const double lo = patch->bbox[d] + patch->lli[d] * h;
-    const double hi = patch->bbox[dim + d] - patch->uui[d] * h;
-    if (point[d] < lo || point[d] > hi)
-      return false;
-  }
-  return true;
-}
-
-bool z4c_cuda_point_in_block(Patch *patch, Block *block,
-                             const double *point, const double *DH)
-{
-  if (!patch || !block)
-    return false;
-  for (int d = 0; d < dim; d++)
-  {
-    double llb;
-    double uub;
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-    llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
-              ? block->bbox[d] + patch->lli[d] * DH[d]
-              : block->bbox[d] + (ghost_width - 0.5) * DH[d];
-    uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
-              ? block->bbox[dim + d] - patch->uui[d] * DH[d]
-              : block->bbox[dim + d] - (ghost_width - 0.5) * DH[d];
-#else
-#ifdef Cell
-    llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
-              ? block->bbox[d] + patch->lli[d] * DH[d]
-              : block->bbox[d] + ghost_width * DH[d];
-    uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
-              ? block->bbox[dim + d] - patch->uui[d] * DH[d]
-              : block->bbox[dim + d] - ghost_width * DH[d];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2)
-      return false;
-  }
-  return true;
-}
-
-int z4c_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn)
-{
-  if (!coords || n <= ordn)
-    return 0;
-  int cxi = int((x - coords[0]) / dx + 0.4) + 1;
-  int start = cxi - ordn / 2;
-  if (start < 0)
-    start = 0;
-  const int max_start = n - ordn;
-  if (start > max_start)
-    start = max_start;
-  return start;
-}
-
-bool z4c_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
-                                       int myrank,
-                                       const double *point,
-                                       var *forx, var *fory, var *forz,
-                                       int Symmetry,
-                                       double *shellf)
-{
-  const int ordn = 2 * ghost_width;
-  int owner_rank = -1;
-
-  shellf[0] = shellf[1] = shellf[2] = 0.0;
-
-  MyList<Patch> *PL = PatL;
-  while (PL)
-  {
-    Patch *patch = PL->data;
-    if (!z4c_cuda_patch_contains_point(patch, point))
-    {
-      PL = PL->next;
-      continue;
-    }
-
-    double DH[dim];
-    for (int d = 0; d < dim; d++)
-      DH[d] = patch->getdX(d);
-
-    MyList<Block> *BP = patch->blb;
-    while (BP)
-    {
-      Block *block = BP->data;
-      if (z4c_cuda_point_in_block(patch, block, point, DH))
-      {
-        owner_rank = block->rank;
-        if (myrank == owner_rank)
-        {
-          int interp_ordn = ordn;
-          int interp_sym = Symmetry;
-          double x = point[0];
-          double y = point[1];
-          double z = point[2];
-
-          if (z4c_cuda_has_resident_state(block) &&
-              block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
-          {
-            const int sx = ordn;
-            const int sy = ordn;
-            const int sz = ordn;
-            const int region_all = sx * sy * sz;
-            const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
-            const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
-            const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
-            double *packed_fields = new double[3 * region_all];
-            var *vars[3] = {forx, fory, forz};
-            for (int f = 0; f < 3; f++)
-            {
-              if (z4c_cuda_pack_state_region_to_host_buffer(block,
-                                                            k_z4c_cuda_bh_state_indices[f],
-                                                            packed_fields + f * region_all,
-                                                            block->shape,
-                                                            i0, j0, k0,
-                                                            sx, sy, sz) != 0)
-              {
-                delete[] packed_fields;
-                cout << "CUDA Z4C BH tile download failed" << endl;
-                MPI_Abort(MPI_COMM_WORLD, 1);
-              }
-              int tile_shape[3] = {sx, sy, sz};
-              f_global_interp(tile_shape,
-                              block->X[0] + i0,
-                              block->X[1] + j0,
-                              block->X[2] + k0,
-                              packed_fields + f * region_all,
-                              shellf[f],
-                              x, y, z,
-                              interp_ordn,
-                              vars[f]->SoA,
-                              interp_sym);
-            }
-            delete[] packed_fields;
-          }
-          else
-          {
-            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
-                            block->fgfs[forx->sgfn], shellf[0],
-                            x, y, z, interp_ordn, forx->SoA, interp_sym);
-            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
-                            block->fgfs[fory->sgfn], shellf[1],
-                            x, y, z, interp_ordn, fory->SoA, interp_sym);
-            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
-                            block->fgfs[forz->sgfn], shellf[2],
-                            x, y, z, interp_ordn, forz->SoA, interp_sym);
-          }
-        }
-        break;
-      }
-      if (BP == patch->ble)
-        break;
-      BP = BP->next;
-    }
-
-    if (owner_rank >= 0)
-      break;
-    PL = PL->next;
-  }
-
-  if (owner_rank < 0)
-    return false;
-
-  MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD);
-  return true;
-}
-
-bool z4c_cuda_compute_porg_rhs_resident(cgh *GH,
-                                        int ilev,
-                                        int myrank,
-                                        int BH_num,
-                                        double **BH_PS,
-                                        double **BH_RHS,
-                                        var *forx, var *fory, var *forz,
-                                        int Symmetry)
-{
-  for (int n = 0; n < BH_num; n++)
-  {
-    double shellf[3] = {0.0, 0.0, 0.0};
-    int lev = ilev;
-    while (lev >= 0 &&
-           !z4c_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n],
-                                              forx, fory, forz, Symmetry, shellf))
-    {
-      --lev;
-    }
-    if (lev < 0)
-      return false;
-    BH_RHS[n][0] = -shellf[0];
-    BH_RHS[n][1] = -shellf[1];
-    BH_RHS[n][2] = -shellf[2];
-  }
-  return true;
-}
-
-bool z4c_cuda_resident_step_enabled()
-{
-  static int enabled = -1;
-  if (enabled < 0)
-  {
-    const char *env = getenv("AMSS_Z4C_CUDA_RESIDENT");
-    enabled = (env && atoi(env) != 0) ? 1 : 0;
-  }
-  return enabled != 0;
-}
-
-} // namespace
-#endif
-
-void Z4c_class::Step(int lev, int YN)
-{
-#if USE_CUDA_Z4C && (ABEtype == 2)
-  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
-#ifdef With_AHF
-  AH_Step_Find(lev, dT_lev);
-#endif
-  bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
-  double ndeps = numepss;
-  if (lev < GH->movls)
-    ndeps = numepsb;
-  double TRK4 = PhysTime;
-  int iter_count = 0;
-  int pre = 0, cor = 1;
-  int ERROR = 0;
-
-  MyList<Patch> *Pp = GH->PatL[lev];
-  while (Pp)
-  {
-    MyList<Block> *BP = Pp->data->blb;
-    while (BP)
-    {
-      Block *cg = BP->data;
-      if (myrank == cg->rank)
-      {
-        double *state_in[Z4C_CUDA_STATE_COUNT];
-        double *state_out[Z4C_CUDA_STATE_COUNT];
-        double propspeed[Z4C_CUDA_STATE_COUNT];
-        double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
-        if (!fill_z4c_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
-            !fill_z4c_cuda_views(cg, SynchList_pre, state_out))
-        {
-          cout << "CUDA Z4C state list mismatch on predictor step" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        int apply_bam_bc = 0;
-#if (MRBD == 0)
-#if (SommerType == 0)
-        apply_bam_bc = (lev == 0) ? 1 : 0;
-#endif
-#elif (MRBD == 1)
-        apply_bam_bc = 1;
-#endif
-        int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0;
-        int apply_enforce_ga = 0;
-#if (AGM == 0)
-        apply_enforce_ga = 1;
-#endif
-        if (z4c_cuda_rk4_substep(cg,
-                                 cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                                 state_in, state_out,
-                                 propspeed, soa_flat, Pp->data->bbox,
-                                 dT_lev, TRK4, iter_count, apply_bam_bc,
-                                 Symmetry, lev, ndeps, pre,
-                                 keep_resident_state, apply_enforce_ga, chitiny))
-        {
-          cout << "CUDA Z4C predictor substep failed in domain: ("
-               << cg->bbox[0] << ":" << cg->bbox[3] << ","
-               << cg->bbox[1] << ":" << cg->bbox[4] << ","
-               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-          ERROR = 1;
-        }
-      }
-      if (BP == Pp->data->ble)
-        break;
-      BP = BP->next;
-    }
-    Pp = Pp->next;
-  }
-
-  {
-    int erh = ERROR;
-    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-  }
-  if (ERROR)
-  {
-    if (myrank == 0 && ErrorMonitor->outfile)
-      ErrorMonitor->outfile << "CUDA Z4C failed in predictor at t = " << PhysTime
-                            << ", lev = " << lev << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
-
-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
-    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
-    for (int ithBH = 0; ithBH < BH_num; ithBH++)
-    {
-      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
-      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
-      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
-      if (Symmetry > 0)
-        Porg[ithBH][2] = fabs(Porg[ithBH][2]);
-      if (Symmetry == 2)
-      {
-        Porg[ithBH][0] = fabs(Porg[ithBH][0]);
-        Porg[ithBH][1] = fabs(Porg[ithBH][1]);
-      }
-    }
-  }
-
-  if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime))
-    z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
-  if (lev == a_lev)
-    AnalysisStuff(lev, dT_lev);
-
-  for (iter_count = 1; iter_count < 4; iter_count++)
-  {
-    if (iter_count == 1 || iter_count == 3)
-      TRK4 += dT_lev / 2;
-    Pp = GH->PatL[lev];
-    while (Pp)
-    {
-      MyList<Block> *BP = Pp->data->blb;
-      while (BP)
-      {
-        Block *cg = BP->data;
-        if (myrank == cg->rank)
-        {
-          double *state_in[Z4C_CUDA_STATE_COUNT];
-          double *state_out[Z4C_CUDA_STATE_COUNT];
-          double propspeed[Z4C_CUDA_STATE_COUNT];
-          double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
-          if (!fill_z4c_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
-              !fill_z4c_cuda_views(cg, SynchList_cor, state_out))
-          {
-            cout << "CUDA Z4C state list mismatch on corrector step" << endl;
-            MPI_Abort(MPI_COMM_WORLD, 1);
-          }
-          int apply_bam_bc = 0;
-#if (MRBD == 0)
-#if (SommerType == 0)
-          apply_bam_bc = (lev == 0) ? 1 : 0;
-#endif
-#elif (MRBD == 1)
-          apply_bam_bc = 1;
-#endif
-          int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0;
-          int apply_enforce_ga = 0;
-#if (AGM == 0)
-          apply_enforce_ga = 1;
-#elif (AGM == 1)
-          apply_enforce_ga = (iter_count == 3) ? 1 : 0;
-#endif
-          if (z4c_cuda_rk4_substep(cg,
-                                   cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                                   state_in, state_out,
-                                   propspeed, soa_flat, Pp->data->bbox,
-                                   dT_lev, TRK4, iter_count, apply_bam_bc,
-                                   Symmetry, lev, ndeps, cor,
-                                   keep_resident_state, apply_enforce_ga, chitiny))
-          {
-            cout << "CUDA Z4C corrector substep failed in domain: ("
-                 << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-            ERROR = 1;
-          }
-        }
-        if (BP == Pp->data->ble)
-          break;
-        BP = BP->next;
-      }
-      Pp = Pp->next;
-    }
-
-    {
-      int erh = ERROR;
-      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
-    }
-    if (ERROR)
-    {
-      if (myrank == 0 && ErrorMonitor->outfile)
-        ErrorMonitor->outfile << "CUDA Z4C failed in RK4 substep#" << iter_count
-                              << " at t = " << PhysTime
-                              << ", lev = " << lev << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-
-    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
-
-    if (BH_num > 0 && lev == GH->levels - 1)
-    {
-      if (z4c_cuda_resident_step_enabled())
-      {
-        if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num,
-                                                Porg, Porg1,
-                                                Sfx, Sfy, Sfz, Symmetry))
-        {
-          if (myrank == 0 && ErrorMonitor->outfile)
-            ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = "
-                                  << PhysTime << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-      }
-      else
-      {
-        compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
-      }
-      for (int ithBH = 0; ithBH < BH_num; ithBH++)
-      {
-        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
-        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
-        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
-        if (Symmetry > 0)
-          Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
-        if (Symmetry == 2)
-        {
-          Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
-          Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
-        }
-      }
-    }
-
-    if (iter_count < 3)
-    {
-      Pp = GH->PatL[lev];
-      while (Pp)
-      {
-        MyList<Block> *BP = Pp->data->blb;
-        while (BP)
-        {
-          Block *cg = BP->data;
-          cg->swapList(SynchList_pre, SynchList_cor, myrank);
-          if (BP == Pp->data->ble)
-            break;
-          BP = BP->next;
-        }
-        Pp = Pp->next;
-      }
-      if (BH_num > 0 && lev == GH->levels - 1)
-      {
-        for (int ithBH = 0; ithBH < BH_num; ithBH++)
-        {
-          Porg[ithBH][0] = Porg1[ithBH][0];
-          Porg[ithBH][1] = Porg1[ithBH][1];
-          Porg[ithBH][2] = Porg1[ithBH][2];
-        }
-      }
-    }
-  }
-
-  z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, false);
-
-#if (RPS == 0)
-  RestrictProlong(lev, YN, BB);
-#endif
-
-  Pp = GH->PatL[lev];
-  while (Pp)
-  {
-    MyList<Block> *BP = Pp->data->blb;
-    while (BP)
-    {
-      Block *cg = BP->data;
-      cg->swapList(StateList, SynchList_cor, myrank);
-      cg->swapList(OldStateList, SynchList_cor, myrank);
-      if (BP == Pp->data->ble)
-        break;
-      BP = BP->next;
-    }
-    Pp = Pp->next;
-  }
-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
-    for (int ithBH = 0; ithBH < BH_num; ithBH++)
-    {
-      Porg0[ithBH][0] = Porg1[ithBH][0];
-      Porg0[ithBH][1] = Porg1[ithBH][1];
-      Porg0[ithBH][2] = Porg1[ithBH][2];
-    }
-  }
-#else
-  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
+#ifndef CPBC
+// for sommerfeld boundary
+
+#if USE_CUDA_Z4C && (ABEtype == 2)
+#if (MRBD == 2)
+#error "USE_CUDA_Z4C resident path does not support MRBD == 2"
+#endif
+
+namespace {
+
+static const int k_z4c_cuda_bh_state_indices[3] = {18, 19, 20};
+
+bool fill_z4c_cuda_views(Block *cg, MyList<var> *vars,
+                         double **host_views,
+                         double *propspeeds = 0,
+                         double *soa_flat = 0)
+{
+  int idx = 0;
+  while (vars && idx < Z4C_CUDA_STATE_COUNT)
+  {
+    host_views[idx] = cg->fgfs[vars->data->sgfn];
+    if (propspeeds)
+      propspeeds[idx] = vars->data->propspeed;
+    if (soa_flat)
+    {
+      soa_flat[3 * idx + 0] = vars->data->SoA[0];
+      soa_flat[3 * idx + 1] = vars->data->SoA[1];
+      soa_flat[3 * idx + 2] = vars->data->SoA[2];
+    }
+    vars = vars->next;
+    ++idx;
+  }
+  return idx == Z4C_CUDA_STATE_COUNT && vars == 0;
+}
+
+void z4c_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
+{
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank && z4c_cuda_has_resident_state(cg))
+      {
+        double *state_out[Z4C_CUDA_STATE_COUNT];
+        if (!fill_z4c_cuda_views(cg, vars, state_out))
+        {
+          cout << "CUDA Z4C state list mismatch on resident state download" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        if (z4c_cuda_download_resident_state(cg, cg->shape, state_out))
+        {
+          cout << "CUDA Z4C resident state download failed" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        if (release_ctx)
+          z4c_cuda_release_step_ctx(cg);
+      }
+      if (BP == Pp->data->ble)
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+}
+
+bool z4c_cuda_patch_contains_point(Patch *patch, const double *point)
+{
+  if (!patch)
+    return false;
+  for (int d = 0; d < dim; d++)
+  {
+    const double h = patch->getdX(d);
+    const double lo = patch->bbox[d] + patch->lli[d] * h;
+    const double hi = patch->bbox[dim + d] - patch->uui[d] * h;
+    if (point[d] < lo || point[d] > hi)
+      return false;
+  }
+  return true;
+}
+
+bool z4c_cuda_point_in_block(Patch *patch, Block *block,
+                             const double *point, const double *DH)
+{
+  if (!patch || !block)
+    return false;
+  for (int d = 0; d < dim; d++)
+  {
+    double llb;
+    double uub;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
+              ? block->bbox[d] + patch->lli[d] * DH[d]
+              : block->bbox[d] + (ghost_width - 0.5) * DH[d];
+    uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
+              ? block->bbox[dim + d] - patch->uui[d] * DH[d]
+              : block->bbox[dim + d] - (ghost_width - 0.5) * DH[d];
+#else
+#ifdef Cell
+    llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
+              ? block->bbox[d] + patch->lli[d] * DH[d]
+              : block->bbox[d] + ghost_width * DH[d];
+    uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
+              ? block->bbox[dim + d] - patch->uui[d] * DH[d]
+              : block->bbox[dim + d] - ghost_width * DH[d];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2)
+      return false;
+  }
+  return true;
+}
+
+int z4c_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn)
+{
+  if (!coords || n <= ordn)
+    return 0;
+  int cxi = int((x - coords[0]) / dx + 0.4) + 1;
+  int start = cxi - ordn / 2;
+  if (start < 0)
+    start = 0;
+  const int max_start = n - ordn;
+  if (start > max_start)
+    start = max_start;
+  return start;
+}
+
+bool z4c_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
+                                       int myrank,
+                                       const double *point,
+                                       var *forx, var *fory, var *forz,
+                                       int Symmetry,
+                                       double *shellf)
+{
+  const int ordn = 2 * ghost_width;
+  int owner_rank = -1;
+
+  shellf[0] = shellf[1] = shellf[2] = 0.0;
+
+  MyList<Patch> *PL = PatL;
+  while (PL)
+  {
+    Patch *patch = PL->data;
+    if (!z4c_cuda_patch_contains_point(patch, point))
+    {
+      PL = PL->next;
+      continue;
+    }
+
+    double DH[dim];
+    for (int d = 0; d < dim; d++)
+      DH[d] = patch->getdX(d);
+
+    MyList<Block> *BP = patch->blb;
+    while (BP)
+    {
+      Block *block = BP->data;
+      if (z4c_cuda_point_in_block(patch, block, point, DH))
+      {
+        owner_rank = block->rank;
+        if (myrank == owner_rank)
+        {
+          int interp_ordn = ordn;
+          int interp_sym = Symmetry;
+          double x = point[0];
+          double y = point[1];
+          double z = point[2];
+
+          if (z4c_cuda_has_resident_state(block) &&
+              block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
+          {
+            const int sx = ordn;
+            const int sy = ordn;
+            const int sz = ordn;
+            const int region_all = sx * sy * sz;
+            const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
+            const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
+            const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
+            double *packed_fields = new double[3 * region_all];
+            var *vars[3] = {forx, fory, forz};
+            for (int f = 0; f < 3; f++)
+            {
+              if (z4c_cuda_pack_state_region_to_host_buffer(block,
+                                                            k_z4c_cuda_bh_state_indices[f],
+                                                            packed_fields + f * region_all,
+                                                            block->shape,
+                                                            i0, j0, k0,
+                                                            sx, sy, sz) != 0)
+              {
+                delete[] packed_fields;
+                cout << "CUDA Z4C BH tile download failed" << endl;
+                MPI_Abort(MPI_COMM_WORLD, 1);
+              }
+              int tile_shape[3] = {sx, sy, sz};
+              f_global_interp(tile_shape,
+                              block->X[0] + i0,
+                              block->X[1] + j0,
+                              block->X[2] + k0,
+                              packed_fields + f * region_all,
+                              shellf[f],
+                              x, y, z,
+                              interp_ordn,
+                              vars[f]->SoA,
+                              interp_sym);
+            }
+            delete[] packed_fields;
+          }
+          else
+          {
+            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
+                            block->fgfs[forx->sgfn], shellf[0],
+                            x, y, z, interp_ordn, forx->SoA, interp_sym);
+            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
+                            block->fgfs[fory->sgfn], shellf[1],
+                            x, y, z, interp_ordn, fory->SoA, interp_sym);
+            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
+                            block->fgfs[forz->sgfn], shellf[2],
+                            x, y, z, interp_ordn, forz->SoA, interp_sym);
+          }
+        }
+        break;
+      }
+      if (BP == patch->ble)
+        break;
+      BP = BP->next;
+    }
+
+    if (owner_rank >= 0)
+      break;
+    PL = PL->next;
+  }
+
+  if (owner_rank < 0)
+    return false;
+
+  MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD);
+  return true;
+}
+
+bool z4c_cuda_compute_porg_rhs_resident(cgh *GH,
+                                        int ilev,
+                                        int myrank,
+                                        int BH_num,
+                                        double **BH_PS,
+                                        double **BH_RHS,
+                                        var *forx, var *fory, var *forz,
+                                        int Symmetry)
+{
+  for (int n = 0; n < BH_num; n++)
+  {
+    double shellf[3] = {0.0, 0.0, 0.0};
+    int lev = ilev;
+    while (lev >= 0 &&
+           !z4c_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n],
+                                              forx, fory, forz, Symmetry, shellf))
+    {
+      --lev;
+    }
+    if (lev < 0)
+      return false;
+    BH_RHS[n][0] = -shellf[0];
+    BH_RHS[n][1] = -shellf[1];
+    BH_RHS[n][2] = -shellf[2];
+  }
+  return true;
+}
+
+bool z4c_cuda_resident_step_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_Z4C_CUDA_RESIDENT");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+} // namespace
+#endif
+
+void Z4c_class::Step(int lev, int YN)
+{
+#if USE_CUDA_Z4C && (ABEtype == 2)
+  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
+#ifdef With_AHF
+  AH_Step_Find(lev, dT_lev);
+#endif
+  bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
+  double ndeps = numepss;
+  if (lev < GH->movls)
+    ndeps = numepsb;
+  double TRK4 = PhysTime;
+  int iter_count = 0;
+  int pre = 0, cor = 1;
+  int ERROR = 0;
+
+#ifdef WithShell
+  if (bssn_cuda_use_resident_sync(lev))
+  {
+    for (int dl = 0; dl < GH->levels; dl++)
+      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
+  }
+#endif
+
+  MyList<Patch> *Pp = GH->PatL[lev];
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank)
+      {
+        double *state_in[Z4C_CUDA_STATE_COUNT];
+        double *state_out[Z4C_CUDA_STATE_COUNT];
+        double propspeed[Z4C_CUDA_STATE_COUNT];
+        double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
+        if (!fill_z4c_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
+            !fill_z4c_cuda_views(cg, SynchList_pre, state_out))
+        {
+          cout << "CUDA Z4C state list mismatch on predictor step" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        int apply_bam_bc = 0;
+#if (MRBD == 0)
+#if (SommerType == 0)
+        apply_bam_bc = (lev == 0) ? 1 : 0;
+#endif
+#elif (MRBD == 1)
+        apply_bam_bc = 1;
+#endif
+        int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0;
+        int apply_enforce_ga = 0;
+#if (AGM == 0)
+        apply_enforce_ga = 1;
+#endif
+        if (z4c_cuda_rk4_substep(cg,
+                                 cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                 state_in, state_out,
+                                 propspeed, soa_flat, Pp->data->bbox,
+                                 dT_lev, TRK4, iter_count, apply_bam_bc,
+                                 Symmetry, lev, ndeps, pre,
+                                 keep_resident_state, apply_enforce_ga, chitiny))
+        {
+          cout << "CUDA Z4C predictor substep failed in domain: ("
+               << cg->bbox[0] << ":" << cg->bbox[3] << ","
+               << cg->bbox[1] << ":" << cg->bbox[4] << ","
+               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+          ERROR = 1;
+        }
+      }
+      if (BP == Pp->data->ble)
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+
+  {
+    int erh = ERROR;
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+  }
+  if (ERROR)
+  {
+    if (myrank == 0 && ErrorMonitor->outfile)
+      ErrorMonitor->outfile << "CUDA Z4C failed in predictor at t = " << PhysTime
+                            << ", lev = " << lev << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
+
+  if (BH_num > 0 && lev == GH->levels - 1)
+  {
+    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
+    for (int ithBH = 0; ithBH < BH_num; ithBH++)
+    {
+      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
+      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
+      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
+      if (Symmetry > 0)
+        Porg[ithBH][2] = fabs(Porg[ithBH][2]);
+      if (Symmetry == 2)
+      {
+        Porg[ithBH][0] = fabs(Porg[ithBH][0]);
+        Porg[ithBH][1] = fabs(Porg[ithBH][1]);
+      }
+    }
+  }
+
+  if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime))
+    z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
+  if (lev == a_lev)
+    AnalysisStuff(lev, dT_lev);
+
+  for (iter_count = 1; iter_count < 4; iter_count++)
+  {
+    if (iter_count == 1 || iter_count == 3)
+      TRK4 += dT_lev / 2;
+    Pp = GH->PatL[lev];
+    while (Pp)
+    {
+      MyList<Block> *BP = Pp->data->blb;
+      while (BP)
+      {
+        Block *cg = BP->data;
+        if (myrank == cg->rank)
+        {
+          double *state_in[Z4C_CUDA_STATE_COUNT];
+          double *state_out[Z4C_CUDA_STATE_COUNT];
+          double propspeed[Z4C_CUDA_STATE_COUNT];
+          double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
+          if (!fill_z4c_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
+              !fill_z4c_cuda_views(cg, SynchList_cor, state_out))
+          {
+            cout << "CUDA Z4C state list mismatch on corrector step" << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          int apply_bam_bc = 0;
+#if (MRBD == 0)
+#if (SommerType == 0)
+          apply_bam_bc = (lev == 0) ? 1 : 0;
+#endif
+#elif (MRBD == 1)
+          apply_bam_bc = 1;
+#endif
+          int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0;
+          int apply_enforce_ga = 0;
+#if (AGM == 0)
+          apply_enforce_ga = 1;
+#elif (AGM == 1)
+          apply_enforce_ga = (iter_count == 3) ? 1 : 0;
+#endif
+          if (z4c_cuda_rk4_substep(cg,
+                                   cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                   state_in, state_out,
+                                   propspeed, soa_flat, Pp->data->bbox,
+                                   dT_lev, TRK4, iter_count, apply_bam_bc,
+                                   Symmetry, lev, ndeps, cor,
+                                   keep_resident_state, apply_enforce_ga, chitiny))
+          {
+            cout << "CUDA Z4C corrector substep failed in domain: ("
+                 << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+            ERROR = 1;
+          }
+        }
+        if (BP == Pp->data->ble)
+          break;
+        BP = BP->next;
+      }
+      Pp = Pp->next;
+    }
+
+    {
+      int erh = ERROR;
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    }
+    if (ERROR)
+    {
+      if (myrank == 0 && ErrorMonitor->outfile)
+        ErrorMonitor->outfile << "CUDA Z4C failed in RK4 substep#" << iter_count
+                              << " at t = " << PhysTime
+                              << ", lev = " << lev << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+
+    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
+
+    if (BH_num > 0 && lev == GH->levels - 1)
+    {
+      if (z4c_cuda_resident_step_enabled())
+      {
+        if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num,
+                                                Porg, Porg1,
+                                                Sfx, Sfy, Sfz, Symmetry))
+        {
+          if (myrank == 0 && ErrorMonitor->outfile)
+            ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = "
+                                  << PhysTime << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+      }
+      else
+      {
+        compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
+      }
+      for (int ithBH = 0; ithBH < BH_num; ithBH++)
+      {
+        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
+        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
+        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
+        if (Symmetry > 0)
+          Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
+        if (Symmetry == 2)
+        {
+          Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
+          Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
+        }
+      }
+    }
+
+    if (iter_count < 3)
+    {
+      Pp = GH->PatL[lev];
+      while (Pp)
+      {
+        MyList<Block> *BP = Pp->data->blb;
+        while (BP)
+        {
+          Block *cg = BP->data;
+          cg->swapList(SynchList_pre, SynchList_cor, myrank);
+          if (BP == Pp->data->ble)
+            break;
+          BP = BP->next;
+        }
+        Pp = Pp->next;
+      }
+      if (BH_num > 0 && lev == GH->levels - 1)
+      {
+        for (int ithBH = 0; ithBH < BH_num; ithBH++)
+        {
+          Porg[ithBH][0] = Porg1[ithBH][0];
+          Porg[ithBH][1] = Porg1[ithBH][1];
+          Porg[ithBH][2] = Porg1[ithBH][2];
+        }
+      }
+    }
+  }
+
+  z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, false);
+
+#if (RPS == 0)
+  RestrictProlong(lev, YN, BB);
+#endif
+
+  Pp = GH->PatL[lev];
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      cg->swapList(StateList, SynchList_cor, myrank);
+      cg->swapList(OldStateList, SynchList_cor, myrank);
+      if (BP == Pp->data->ble)
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+  if (BH_num > 0 && lev == GH->levels - 1)
+  {
+    for (int ithBH = 0; ithBH < BH_num; ithBH++)
+    {
+      Porg0[ithBH][0] = Porg1[ithBH][0];
+      Porg0[ithBH][1] = Porg1[ithBH][1];
+      Porg0[ithBH][2] = Porg1[ithBH][2];
+    }
+  }
+#else
+  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
 #ifdef With_AHF
   AH_Step_Find(lev, dT_lev);
 #endif
@@ -915,6 +923,13 @@ void Z4c_class::Step(int lev, int YN)
   }
 
 #ifdef WithShell
+#if USE_CUDA_Z4C
+  if (bssn_cuda_use_resident_sync(lev))
+  {
+    for (int dl = 0; dl < GH->levels; dl++)
+      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
+  }
+#endif
   // evolve Shell Patches
   if (lev == 0)
   {
@@ -1615,19 +1630,17 @@ void Z4c_class::Step(int lev, int YN)
     {
       Porg0[ithBH][0] = Porg1[ithBH][0];
       Porg0[ithBH][1] = Porg1[ithBH][1];
-      Porg0[ithBH][2] = Porg1[ithBH][2];
-    }
-  }
-#endif
-}
-#else
-// for constraint preserving boundary (CPBC)
-#if USE_CUDA_Z4C && (ABEtype == 2)
-#error "USE_CUDA_Z4C resident path does not support CPBC"
-#endif
-#ifndef WithShell
-#error "CPBC only supports Shell"
-#endif
+      Porg0[ithBH][2] = Porg1[ithBH][2];
+    }
+  }
+#endif
+}
+#else
+// for constraint preserving boundary (CPBC)
+// Note: CPBC path uses CPU Fortran RHS; GPU resident sync is a no-op here.
+#ifndef WithShell
+#error "CPBC only supports Shell"
+#endif
 
 // 0: extroplate rhs, 1: extroplate variable
 // 2: extroplate variable but before RHS calculation
@@ -1654,6 +1667,14 @@ void Z4c_class::Step(int lev, int YN)
   int pre = 0, cor = 1;
   int ERROR = 0;
 
+#if USE_CUDA_Z4C && defined(WithShell)
+  if (bssn_cuda_use_resident_sync(lev))
+  {
+    for (int dl = 0; dl < GH->levels; dl++)
+      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
+  }
+#endif
+
   MyList<ss_patch> *sPp;
   // Predictor
   MyList<Patch> *Pp = GH->PatL[lev];
@@ -3258,13 +3279,13 @@ void Z4c_class::Interp_Constraint()
     }
   }
 
-  ofstream outfile;
-  char suffix[64];
-  sprintf(suffix, "/interp_constraint_%05d.dat", int(PhysTime / dT + 0.5));
-  string filename = ErrorMonitor->out_dir + suffix;
-  // 0.5 for round off
-  
-  outfile.open(filename.c_str());
+  ofstream outfile;
+  char suffix[64];
+  sprintf(suffix, "/interp_constraint_%05d.dat", int(PhysTime / dT + 0.5));
+  string filename = ErrorMonitor->out_dir + suffix;
+  // 0.5 for round off
+  
+  outfile.open(filename.c_str());
   outfile << "#  corrdinate, H_Res, Px_Res, Py_Res, Pz_Res, Gx_Res, Gy_Res, Gz_Res, ...." << endl;
   for (int i = 0; i < n; i++)
   {
diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index 241cbf1..90ddca7 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -548,6 +548,8 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
   return idx == state_count && vars == 0;
 }
 
+} // namespace
+
 bool bssn_cuda_use_resident_sync(int lev)
 {
   (void)lev;
@@ -1032,7 +1034,6 @@ void bssn_cuda_sync_level_bh_fields(MyList<Patch> *PatL,
   }
 }
 
-} // namespace
 #endif
 
 #if !USE_CUDA_BSSN
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h
index 9995bb5..27c7953 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -404,6 +404,10 @@ void bssn_cuda_release_step_ctx(void *block_tag);
 
 #ifdef __cplusplus
 }
+// C++-only helpers declared for derived equation classes (Z4C, etc.)
+// Defined in bssn_class.C. Requires MyList, Patch, var from including TU.
+bool bssn_cuda_use_resident_sync(int lev);
+void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var> *vars, int myrank);
 #endif
 
 #endif