diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index 7592cea..ecbac7a 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -95,7 +95,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
     while (BP)
     {
       Block *cg = BP->data;
-      if (myrank == cg->rank)
+      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
       {
         double *state_out[BSSN_CUDA_STATE_COUNT];
         if (!fill_bssn_cuda_views(cg, vars, state_out))
@@ -3301,18 +3301,21 @@ void bssn_class::Step(int lev, int YN)
                                cg->fgfs[varlrhs->data->sgfn],
                                iter_count);
 #endif
+            if (!used_gpu_substep)
+            {
 #ifndef WithShell
-            if (lev > 0) // fix BD point
+              if (lev > 0) // fix BD point
 #endif
-              f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                                Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
-                                Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
-                                dT_lev, 
-                                cg->fgfs[phi0->sgfn],
-                                cg->fgfs[Lap0->sgfn], 
-                                cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], 
-                                varl0->data->SoA,
-                                Symmetry, cor);
+                f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                  Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
+                                  Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
+                                  dT_lev, 
+                                  cg->fgfs[phi0->sgfn],
+                                  cg->fgfs[Lap0->sgfn], 
+                                  cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], 
+                                  varl0->data->SoA,
+                                  Symmetry, cor);
+            }
 
 #if (SommerType == 1)
 #warning "shell part still bam type"
@@ -3333,7 +3336,7 @@ void bssn_class::Step(int lev, int YN)
             varlrhs = varlrhs->next;
           }
         }
-        if (!used_gpu_resident_state)
+        if (!used_gpu_substep)
           f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
       }
       if (BP == Pp->data->ble)
@@ -3722,18 +3725,21 @@ void bssn_class::Step(int lev, int YN)
                                  iter_count);
 #endif
 
+              if (!used_gpu_substep)
+              {
 #ifndef WithShell
-              if (lev > 0) // fix BD point
+                if (lev > 0) // fix BD point
 #endif
-                f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                                  Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
-                                  Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
-                                  dT_lev, 
-                                  cg->fgfs[phi0->sgfn],
-                                  cg->fgfs[Lap0->sgfn], 
-                                  cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn], 
-                                  varl0->data->SoA,
-                                  Symmetry, cor);
+                  f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                    Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
+                                    Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
+                                    dT_lev, 
+                                    cg->fgfs[phi0->sgfn],
+                                    cg->fgfs[Lap0->sgfn], 
+                                    cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn], 
+                                    varl0->data->SoA,
+                                    Symmetry, cor);
+              }
 
 #if (SommerType == 1)
               if (lev == 1) // shibata type sommerfeld
@@ -3754,7 +3760,7 @@ void bssn_class::Step(int lev, int YN)
               varlrhs = varlrhs->next;
             }
           }
-          if (!used_gpu_resident_state)
+          if (!used_gpu_substep)
             f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
         }
         if (BP == Pp->data->ble)
@@ -8636,4 +8642,3 @@ bool bssn_class::check_Stdin_Abort()
 }
 
 //================================================================================================
-
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu
index f709e48..abcad51 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -1216,6 +1216,11 @@ struct Rk4FinalizeTables {
     double *accum_fields[BSSN_STATE_COUNT];
 };
 
+struct PatchBoundaryTables {
+    const double *src_fields[BSSN_STATE_COUNT];
+    double *dst_fields[BSSN_STATE_COUNT];
+};
+
 static const int BLK = 128;
 static inline int grid(size_t n) {
     if (n == 0) return 1;
@@ -1591,6 +1596,56 @@ static void gpu_rk4_finalize_batch(const StepContext &ctx,
     kern_rk4_finalize_batched<<<launch_grid, BLK>>>(tables, dT, rk4_stage, chitiny);
 }
 
+__global__ __launch_bounds__(128, 4)
+void kern_copy_patch_boundary_batched(PatchBoundaryTables tables,
+                                      int touch_xmin, int touch_xmax,
+                                      int touch_ymin, int touch_ymax,
+                                      int touch_zmin, int touch_zmax)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= d_gp.all) return;
+
+    const int nx = d_gp.ex[0];
+    const int ny = d_gp.ex[1];
+    const int i0 = tid % nx;
+    const int j0 = (tid / nx) % ny;
+    const int k0 = tid / (nx * ny);
+
+    const bool on_boundary =
+        (touch_xmin && i0 == 0) ||
+        (touch_xmax && i0 == nx - 1) ||
+        (touch_ymin && j0 == 0) ||
+        (touch_ymax && j0 == ny - 1) ||
+        (touch_zmin && k0 == 0) ||
+        (touch_zmax && k0 == d_gp.ex[2] - 1);
+    if (!on_boundary) return;
+
+    const int field = blockIdx.y;
+    tables.dst_fields[field][tid] = tables.src_fields[field][tid];
+}
+
+static void gpu_copy_patch_boundary_batch(int all,
+                                          int touch_xmin, int touch_xmax,
+                                          int touch_ymin, int touch_ymax,
+                                          int touch_zmin, int touch_zmax)
+{
+    if (!(touch_xmin || touch_xmax || touch_ymin || touch_ymax || touch_zmin || touch_zmax))
+        return;
+
+    PatchBoundaryTables tables = {};
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        tables.src_fields[i] = g_buf.slot[k_state_input_slots[i]];
+        tables.dst_fields[i] = g_buf.slot[k_state_rhs_slots[i]];
+    }
+
+    dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)BSSN_STATE_COUNT);
+    kern_copy_patch_boundary_batched<<<launch_grid, BLK>>>(
+        tables,
+        touch_xmin, touch_xmax,
+        touch_ymin, touch_ymax,
+        touch_zmin, touch_zmax);
+}
+
 __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
                                      double * __restrict__ gxy,
                                      double * __restrict__ gxz,
@@ -3008,6 +3063,32 @@ static void setup_grid_params(int *ex,
     upload_grid_params_if_needed(gp);
 }
 
+static void compute_patch_boundary_flags(int *ex,
+                                         double *X, double *Y, double *Z,
+                                         const double *bbox,
+                                         int Symmetry,
+                                         int &touch_xmin, int &touch_xmax,
+                                         int &touch_ymin, int &touch_ymax,
+                                         int &touch_zmin, int &touch_zmax)
+{
+    const double dX = X[1] - X[0];
+    const double dY = Y[1] - Y[0];
+    const double dZ = Z[1] - Z[0];
+    const int NO_SYMM = 0;
+    const int OCTANT = 2;
+
+    touch_xmax = (std::fabs(X[ex[0] - 1] - bbox[3]) < dX) ? 1 : 0;
+    touch_ymax = (std::fabs(Y[ex[1] - 1] - bbox[4]) < dY) ? 1 : 0;
+    touch_zmax = (std::fabs(Z[ex[2] - 1] - bbox[5]) < dZ) ? 1 : 0;
+
+    touch_xmin = (std::fabs(X[0] - bbox[0]) < dX &&
+                  !(Symmetry == OCTANT && std::fabs(bbox[0]) < dX / 2.0)) ? 1 : 0;
+    touch_ymin = (std::fabs(Y[0] - bbox[1]) < dY &&
+                  !(Symmetry == OCTANT && std::fabs(bbox[1]) < dY / 2.0)) ? 1 : 0;
+    touch_zmin = (std::fabs(Z[0] - bbox[2]) < dZ &&
+                  !(Symmetry > NO_SYMM && std::fabs(bbox[2]) < dZ / 2.0)) ? 1 : 0;
+}
+
 static void upload_state_inputs(double **state_host, size_t all)
 {
     const size_t bytes = all * sizeof(double);
@@ -3922,8 +4003,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
 
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
     const size_t bytes = all * sizeof(double);
+    int touch_xmin = 0, touch_xmax = 0;
+    int touch_ymin = 0, touch_ymax = 0;
+    int touch_zmin = 0, touch_zmax = 0;
 
     setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
+    if (Lev > 0) {
+        compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry,
+                                     touch_xmin, touch_xmax,
+                                     touch_ymin, touch_ymax,
+                                     touch_zmin, touch_zmax);
+    }
     StepContext &ctx = ensure_step_ctx(block_tag, all);
     const bool use_resident_state = (keep_resident_state != 0);
     if (use_resident_state) {
@@ -3986,6 +4076,12 @@ int bssn_cuda_rk4_substep(void *block_tag,
 
     t0 = profile ? cuda_profile_now_ms() : 0.0;
     gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
+    if (Lev > 0) {
+        gpu_copy_patch_boundary_batch((int)all,
+                                      touch_xmin, touch_xmax,
+                                      touch_ymin, touch_ymax,
+                                      touch_zmin, touch_zmax);
+    }
     if (profile) {
         cuda_profile_sync();
         finalize_ms += cuda_profile_now_ms() - t0;