diff --git a/AMSS_NCKU_source/bssnEScalar_class.C b/AMSS_NCKU_source/bssnEScalar_class.C
index 8de5829..48ab025 100644
--- a/AMSS_NCKU_source/bssnEScalar_class.C
+++ b/AMSS_NCKU_source/bssnEScalar_class.C
@@ -194,7 +194,7 @@ bool bssn_escalar_cuda_bh_interp_resident_enabled()
   if (enabled < 0)
   {
     const char *env = getenv("AMSS_CUDA_BH_INTERP_RESIDENT");
-    enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
+    enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 0;
   }
   return enabled != 0;
 }
diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index e70584e..2b88c67 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -1021,7 +1021,9 @@ void bssn_cuda_sync_level_bh_fields(MyList<Patch> *PatL,
     while (BP)
     {
       Block *cg = BP->data;
-      if (myrank == cg->rank && !bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false))
+      if (myrank == cg->rank &&
+          bssn_cuda_has_resident_state(cg) &&
+          !bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false))
       {
         cout << "CUDA BH state subset download failed" << endl;
         MPI_Abort(MPI_COMM_WORLD, 1);
@@ -1058,7 +1060,7 @@ bool bssn_cuda_bh_interp_resident_enabled()
     if (env)
       enabled = (atoi(env) != 0) ? 1 : 0;
     else
-      enabled = 1;
+      enabled = 0;
   }
   return enabled != 0;
 }
@@ -8589,6 +8591,23 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
 {
   const int InList = 3;
 
+#if USE_CUDA_BSSN
+  const bool use_resident_bh_interp = bssn_cuda_bh_interp_resident_enabled();
+  if (!use_resident_bh_interp && bssn_cuda_use_resident_sync(ilev))
+  {
+    MyList<var> *host_state_list = 0;
+    if (forx == Sfx0 && fory == Sfy0 && forz == Sfz0)
+      host_state_list = StateList;
+    else if (forx == Sfx && fory == Sfy && forz == Sfz)
+      host_state_list = SynchList_pre;
+    else if (forx == Sfx1 && fory == Sfy1 && forz == Sfz1)
+      host_state_list = SynchList_cor;
+
+    if (host_state_list)
+      bssn_cuda_download_level_state_if_present(GH->PatL[ilev], host_state_list, myrank);
+  }
+#endif
+
   MyList<var> *DG_List = new MyList<var>(forx);
   DG_List->insert(fory);
   DG_List->insert(forz);
@@ -8609,7 +8628,7 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
     int lev = ilev;
 
 #if USE_CUDA_BSSN
-    if (bssn_cuda_bh_interp_resident_enabled() &&
+    if (use_resident_bh_interp &&
         bssn_cuda_use_resident_sync(lev) &&
         bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf))
     {
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu
index 6313e5f..1b0c4d5 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -7693,15 +7693,15 @@ __device__ __forceinline__ double load_comm_state_cell_sym(const double * __rest
 {
     double s = 1.0;
     if (x < 0) {
-        x = -x;
+        x = -x - 1;
         s *= d_comm_state_soa[3 * state_index + 0];
     }
     if (y < 0) {
-        y = -y;
+        y = -y - 1;
         s *= d_comm_state_soa[3 * state_index + 1];
     }
     if (z < 0) {
-        z = -z;
+        z = -z - 1;
         s *= d_comm_state_soa[3 * state_index + 2];
     }
     const int src = x + y * nx + z * nx * ny;
diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile
index b3f6914..fee25b0 100644
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -18,9 +18,9 @@ OMP_FLAG = -qopenmp
 
 ifeq ($(PGO_MODE),instrument)
 ## Intel Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
-CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
+CXXAPPFLAGS = -O3 -march=znver5  -fma -fprofile-instr-generate -ipo \
               -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
-f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
+f90appflags = -O3 -march=znver5  -fma -fprofile-instr-generate -ipo \
               -align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
 else
 ## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
@@ -28,23 +28,23 @@ else
 ## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
 
 
-CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
+CXXAPPFLAGS = -O3 -march=znver5  -fp-model fast=2 -fma -ipo \
               -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
-f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
+f90appflags = -O3 -march=znver5  -fp-model fast=2 -fma -ipo \
               -align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
 endif
 
-TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
+TP_OPTFLAGS = -O3 -march=znver5  -fp-model fast=2 -fma -ipo \
               -Dfortran3 -Dnewc $(MKL_INC)
 else
 ## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
 ## PGO_MODE is ignored in this branch.
 OMP_FLAG = -mp
-CXXAPPFLAGS = -O3 -tp=host -Mcache_align -Mfma \
+CXXAPPFLAGS = -O3 -march=znver5 -tp=host -Mcache_align -Mfma \
               -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
-f90appflags = -O3 -tp=host -Mcache_align -Mfma -Mpreprocess \
+f90appflags = -O3 -march=znver5 -tp=host -Mcache_align -Mfma -Mpreprocess \
               $(MKL_INC) $(POLINT6_FLAG)
-TP_OPTFLAGS = -O3 -tp=host -Mcache_align -Mfma \
+TP_OPTFLAGS = -O3 -march=znver5 -tp=host -Mcache_align -Mfma \
               -Dfortran3 -Dnewc $(MKL_INC)
 endif
 
diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu
index 4e5b3c9..1e06ee2 100644
--- a/AMSS_NCKU_source/z4c_rhs_cuda.cu
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu
@@ -5206,15 +5206,15 @@ __device__ __forceinline__ double load_comm_state_cell_sym(const double * __rest
 {
     double s = 1.0;
     if (x < 0) {
-        x = -x;
+        x = -x - 1;
         s *= d_comm_state_soa[3 * state_index + 0];
     }
     if (y < 0) {
-        y = -y;
+        y = -y - 1;
         s *= d_comm_state_soa[3 * state_index + 1];
     }
     if (z < 0) {
-        z = -z;
+        z = -z - 1;
         s *= d_comm_state_soa[3 * state_index + 2];
     }
     const int src = x + y * nx + z * nx * ny;