diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index 311f4be..239574e 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -424,14 +424,7 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type, MyList<var>
     return false;
   if (z4c_cuda_has_resident_state(dst->Bg) == 0)
     return false;
-  if (type != 1 && VarListd)
-  {
-    double *view_ptrs[Z4C_CUDA_STATE_COUNT];
-    if (!cuda_build_z4c_host_views(dst->Bg, VarListd, Z4C_CUDA_STATE_COUNT, view_ptrs))
-      return false;
-    if (z4c_cuda_resident_state_matches(dst->Bg, view_ptrs) == 0)
-      return false;
-  }
+  (void)VarListd;
   return true;
 #elif USE_CUDA_BSSN
   return bssn_cuda_has_resident_state(dst->Bg) != 0;
@@ -460,9 +453,16 @@ bool cuda_direct_pack_segment(double *buffer,
   const int j0 = cuda_seg_begin(dst, src->Bg, 1);
   const int k0 = cuda_seg_begin(dst, src->Bg, 2);
 #if USE_CUDA_Z4C && (ABEtype == 2)
-  const bool ok = z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape,
-                                                           i0, j0, k0,
-                                                           dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+  double *views[Z4C_CUDA_STATE_COUNT];
+  const bool have_views = cuda_build_z4c_host_views(src->Bg, VarLists, state_count, views);
+  const bool ok = have_views
+                      ? z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(
+                            src->Bg, views, state_count, buffer, src->Bg->shape,
+                            i0, j0, k0,
+                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0
+                      : z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape,
+                                                                 i0, j0, k0,
+                                                                 dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
 #else
   double *views[BSSN_CUDA_STATE_COUNT];
   const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
@@ -500,9 +500,16 @@ bool cuda_direct_unpack_segment(double *buffer,
   const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
   const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
 #if USE_CUDA_Z4C && (ABEtype == 2)
-  const bool ok = z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape,
-                                                               i0, j0, k0,
-                                                               dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+  double *views[Z4C_CUDA_STATE_COUNT];
+  const bool have_views = cuda_build_z4c_host_views(dst->Bg, VarListd, state_count, views);
+  const bool ok = have_views
+                      ? z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(
+                            dst->Bg, views, state_count, buffer, dst->Bg->shape,
+                            i0, j0, k0,
+                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0
+                      : z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape,
+                                                                     i0, j0, k0,
+                                                                     dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
 #else
   double *views[BSSN_CUDA_STATE_COUNT];
   const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
@@ -703,40 +710,60 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
   {
     const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
     bool ok = false;
+    double *views[Z4C_CUDA_STATE_COUNT];
     double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
+    const bool have_views = cuda_build_z4c_host_views(src->Bg, VarLists, state_count, views);
     const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat);
     if (type == 1)
     {
       const int i0 = cuda_seg_begin(dst, src->Bg, 0);
       const int j0 = cuda_seg_begin(dst, src->Bg, 1);
       const int k0 = cuda_seg_begin(dst, src->Bg, 2);
-      ok = z4c_cuda_pack_state_batch_to_device_buffer(
-               src->Bg, state_count, buffer, src->Bg->shape,
-               i0, j0, k0,
-               dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+      ok = have_views
+               ? z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(
+                     src->Bg, views, state_count, buffer, src->Bg->shape,
+                     i0, j0, k0,
+                     dst->shape[0], dst->shape[1], dst->shape[2]) == 0
+               : z4c_cuda_pack_state_batch_to_device_buffer(
+                     src->Bg, state_count, buffer, src->Bg->shape,
+                     i0, j0, k0,
+                     dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
     }
     else if (type == 2)
     {
       int first_fine[3];
       if (!cuda_cell_gw3_restrict_params(src, dst, first_fine))
         return false;
-      ok = z4c_cuda_restrict_state_batch_to_device_buffer(
-               src->Bg, state_count, buffer, src->Bg->shape,
-               dst->shape[0], dst->shape[1], dst->shape[2],
-               first_fine[0], first_fine[1], first_fine[2],
-               have_soa ? soa_flat : 0) == 0;
+      ok = have_views
+               ? z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(
+                     src->Bg, views, state_count, buffer, src->Bg->shape,
+                     dst->shape[0], dst->shape[1], dst->shape[2],
+                     first_fine[0], first_fine[1], first_fine[2],
+                     have_soa ? soa_flat : 0) == 0
+               : z4c_cuda_restrict_state_batch_to_device_buffer(
+                     src->Bg, state_count, buffer, src->Bg->shape,
+                     dst->shape[0], dst->shape[1], dst->shape[2],
+                     first_fine[0], first_fine[1], first_fine[2],
+                     have_soa ? soa_flat : 0) == 0;
     }
     else if (type == 3)
     {
       int first_fine_ii[3], coarse_lb[3];
       if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb))
         return false;
-      ok = z4c_cuda_prolong_state_batch_to_device_buffer(
-               src->Bg, state_count, buffer, src->Bg->shape,
-               dst->shape[0], dst->shape[1], dst->shape[2],
-               first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
-               coarse_lb[0], coarse_lb[1], coarse_lb[2],
-               have_soa ? soa_flat : 0) == 0;
+      ok = have_views
+               ? z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(
+                     src->Bg, views, state_count, buffer, src->Bg->shape,
+                     dst->shape[0], dst->shape[1], dst->shape[2],
+                     first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
+                     coarse_lb[0], coarse_lb[1], coarse_lb[2],
+                     have_soa ? soa_flat : 0) == 0
+               : z4c_cuda_prolong_state_batch_to_device_buffer(
+                     src->Bg, state_count, buffer, src->Bg->shape,
+                     dst->shape[0], dst->shape[1], dst->shape[2],
+                     first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
+                     coarse_lb[0], coarse_lb[1], coarse_lb[2],
+                     have_soa ? soa_flat : 0) == 0;
     }
     if (sync_profile_enabled())
       sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
@@ -919,10 +946,17 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
     const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
     const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
     const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
-    const bool ok = z4c_cuda_unpack_state_batch_from_device_buffer(
-        dst->Bg, state_count, buffer, dst->Bg->shape,
-        i0, j0, k0,
-        dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+    double *views[Z4C_CUDA_STATE_COUNT];
+    const bool have_views = cuda_build_z4c_host_views(dst->Bg, VarListd, state_count, views);
+    const bool ok = have_views
+                        ? z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(
+                              dst->Bg, views, state_count, buffer, dst->Bg->shape,
+                              i0, j0, k0,
+                              dst->shape[0], dst->shape[1], dst->shape[2]) == 0
+                        : z4c_cuda_unpack_state_batch_from_device_buffer(
+                              dst->Bg, state_count, buffer, dst->Bg->shape,
+                              i0, j0, k0,
+                              dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
     if (sync_profile_enabled())
       sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
     return ok;
@@ -1074,23 +1108,39 @@ bool cuda_flush_device_segment_batch(Block *block,
 #if USE_CUDA_Z4C && (ABEtype == 2)
   if (state_count == Z4C_CUDA_STATE_COUNT)
   {
+    double *views[Z4C_CUDA_STATE_COUNT];
     double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
+    const bool have_views = cuda_build_z4c_host_views(block, vars, state_count, views);
     const bool have_soa = cuda_build_state_soa(vars, state_count, soa_flat);
     if (dir == PACK)
     {
       if (type == 2)
-        return z4c_cuda_restrict_state_segments_to_device_buffer(
-                   block, state_count, data, block->shape, segment_count,
-                   meta.data(), have_soa ? soa_flat : 0) == 0;
+        return have_views
+                   ? z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(
+                         block, views, state_count, data, block->shape, segment_count,
+                         meta.data(), have_soa ? soa_flat : 0) == 0
+                   : z4c_cuda_restrict_state_segments_to_device_buffer(
+                         block, state_count, data, block->shape, segment_count,
+                         meta.data(), have_soa ? soa_flat : 0) == 0;
       if (type == 3)
-        return z4c_cuda_prolong_state_segments_to_device_buffer(
-                   block, state_count, data, block->shape, segment_count,
-                   meta.data(), have_soa ? soa_flat : 0) == 0;
-      return z4c_cuda_pack_state_segments_to_device_buffer(
-                 block, state_count, data, block->shape, segment_count, meta.data()) == 0;
+        return have_views
+                   ? z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(
+                         block, views, state_count, data, block->shape, segment_count,
+                         meta.data(), have_soa ? soa_flat : 0) == 0
+                   : z4c_cuda_prolong_state_segments_to_device_buffer(
+                         block, state_count, data, block->shape, segment_count,
+                         meta.data(), have_soa ? soa_flat : 0) == 0;
+      return have_views
+                 ? z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(
+                       block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
+                 : z4c_cuda_pack_state_segments_to_device_buffer(
+                       block, state_count, data, block->shape, segment_count, meta.data()) == 0;
     }
-    return z4c_cuda_unpack_state_segments_from_device_buffer(
-               block, state_count, data, block->shape, segment_count, meta.data()) == 0;
+    return have_views
+               ? z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(
+                     block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
+               : z4c_cuda_unpack_state_segments_from_device_buffer(
+                     block, state_count, data, block->shape, segment_count, meta.data()) == 0;
   }
 #endif
 #if USE_CUDA_BSSN
@@ -5294,7 +5344,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
 	                         dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
 	                         dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
 	            }
-	          }
+          }
           if (dir == UNPACK) // from target data to corresponding grid
           {
             f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C
index 57c9772..1c50e72 100644
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -388,41 +388,57 @@ bool z4c_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
           if (z4c_cuda_has_resident_state(block) &&
               block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
           {
-            const int sx = ordn;
-            const int sy = ordn;
-            const int sz = ordn;
-            const int region_all = sx * sy * sz;
-            const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
-            const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
-            const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
-            double *packed_fields = new double[3 * region_all];
             var *vars[3] = {forx, fory, forz};
-            for (int f = 0; f < 3; f++)
+            static int use_device_bh_interp = -1;
+            if (use_device_bh_interp < 0)
             {
-              if (z4c_cuda_pack_state_region_to_host_buffer(block,
-                                                            k_z4c_cuda_bh_state_indices[f],
-                                                            packed_fields + f * region_all,
-                                                            block->shape,
-                                                            i0, j0, k0,
-                                                            sx, sy, sz) != 0)
+              const char *env = getenv("AMSS_CUDA_Z4C_BH_INTERP_DEVICE");
+              use_device_bh_interp = (env && atoi(env) != 0) ? 1 : 0;
+            }
+            bool used_device_interp = false;
+            if (use_device_bh_interp)
+            {
+              double soa3[9];
+              for (int f = 0; f < 3; f++)
               {
-                delete[] packed_fields;
-                cout << "CUDA Z4C BH tile download failed" << endl;
+                soa3[3 * f + 0] = vars[f]->SoA[0];
+                soa3[3 * f + 1] = vars[f]->SoA[1];
+                soa3[3 * f + 2] = vars[f]->SoA[2];
+              }
+              used_device_interp =
+                  (z4c_cuda_interp_state_point3(block, block->shape,
+                                                k_z4c_cuda_bh_state_indices[0],
+                                                k_z4c_cuda_bh_state_indices[1],
+                                                k_z4c_cuda_bh_state_indices[2],
+                                                block->X[0][0], block->X[1][0], block->X[2][0],
+                                                DH[0], DH[1], DH[2],
+                                                x, y, z,
+                                                interp_ordn, interp_sym,
+                                                soa3, shellf) == 0);
+            }
+            if (!used_device_interp)
+            {
+              double *shift_views[3] = {
+                  block->fgfs[forx->sgfn],
+                  block->fgfs[fory->sgfn],
+                  block->fgfs[forz->sgfn]};
+              if (z4c_cuda_download_state_subset(block, block->shape, 3,
+                                                 k_z4c_cuda_bh_state_indices,
+                                                 shift_views) != 0)
+              {
+                cout << "CUDA Z4C BH shift download failed" << endl;
                 MPI_Abort(MPI_COMM_WORLD, 1);
               }
-              int tile_shape[3] = {sx, sy, sz};
-              f_global_interp(tile_shape,
-                              block->X[0] + i0,
-                              block->X[1] + j0,
-                              block->X[2] + k0,
-                              packed_fields + f * region_all,
-                              shellf[f],
-                              x, y, z,
-                              interp_ordn,
-                              vars[f]->SoA,
-                              interp_sym);
+              f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
+                              block->fgfs[forx->sgfn], shellf[0],
+                              x, y, z, interp_ordn, forx->SoA, interp_sym);
+              f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
+                              block->fgfs[fory->sgfn], shellf[1],
+                              x, y, z, interp_ordn, fory->SoA, interp_sym);
+              f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
+                              block->fgfs[forz->sgfn], shellf[2],
+                              x, y, z, interp_ordn, forz->SoA, interp_sym);
             }
-            delete[] packed_fields;
           }
           else
           {
diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu
index ec25cca..72c47b1 100644
--- a/AMSS_NCKU_source/z4c_rhs_cuda.cu
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu
@@ -378,6 +378,7 @@ static const int STAGE_SLOT_COUNT =
 static constexpr int BSSN_STATE_COUNT = Z4C_CUDA_STATE_COUNT;
 static constexpr int BSSN_MATTER_COUNT = 10;
 static constexpr int BSSN_LK_FIELD_COUNT = 24;
+static constexpr int Z4C_RESIDENT_BANK_COUNT = 4;
 
 static const int k_state_input_slots[BSSN_STATE_COUNT] = {
     S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
@@ -454,6 +455,7 @@ struct StepContext {
     double *d_accum_mem;
     double *d_state_curr_mem;
     double *d_state_next_mem;
+    std::array<double *, Z4C_RESIDENT_BANK_COUNT> d_resident_mem;
     double *d_matter_mem;
     double *d_comm_mem;
     double *h_comm_mem;
@@ -462,36 +464,49 @@ struct StepContext {
     std::array<double *, BSSN_STATE_COUNT> d_state_curr;
     std::array<double *, BSSN_STATE_COUNT> d_state_next;
     std::array<double *, BSSN_MATTER_COUNT> d_matter;
-    std::array<double *, BSSN_STATE_COUNT> resident_host;
+    std::array<std::array<double *, BSSN_STATE_COUNT>, Z4C_RESIDENT_BANK_COUNT> d_resident;
+    std::array<std::array<double *, BSSN_STATE_COUNT>, Z4C_RESIDENT_BANK_COUNT> resident_host;
+    std::array<std::array<unsigned char, BSSN_STATE_COUNT>, Z4C_RESIDENT_BANK_COUNT> resident_host_clean;
+    std::array<unsigned long long, Z4C_RESIDENT_BANK_COUNT> resident_age;
+    std::array<bool, Z4C_RESIDENT_BANK_COUNT> resident_valid;
     size_t cap_all;
     size_t cap_comm;
     bool h_comm_pinned;
     size_t cap_h_comm;
     bool matter_ready;
     bool state_ready;
-    bool resident_host_valid;
+    int current_bank;
+    unsigned long long resident_clock;
 
     StepContext()
         : d_state0_mem(nullptr), d_accum_mem(nullptr),
           d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
+          d_resident_mem{},
           d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr),
           cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0),
-          matter_ready(false), state_ready(false), resident_host_valid(false)
+          matter_ready(false), state_ready(false),
+          current_bank(-1), resident_clock(0)
     {
+        d_resident_mem.fill(nullptr);
         d_state0.fill(nullptr);
         d_accum.fill(nullptr);
         d_state_curr.fill(nullptr);
         d_state_next.fill(nullptr);
         d_matter.fill(nullptr);
-        resident_host.fill(nullptr);
+        for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) {
+            d_resident[b].fill(nullptr);
+            resident_host[b].fill(nullptr);
+            resident_host_clean[b].fill(0);
+        }
+        resident_age.fill(0);
+        resident_valid.fill(false);
     }
 };
 
 struct StepAllocation {
     double *d_state0_mem;
     double *d_accum_mem;
-    double *d_state_curr_mem;
-    double *d_state_next_mem;
+    std::array<double *, Z4C_RESIDENT_BANK_COUNT> d_resident_mem;
     double *d_matter_mem;
     double *d_comm_mem;
     double *h_comm_mem;
@@ -508,11 +523,17 @@ static size_t g_comm_segment_meta_cap = 0;
 
 static StepAllocation empty_step_allocation()
 {
-    StepAllocation alloc = {
-        nullptr, nullptr, nullptr, nullptr, nullptr,
-        nullptr, nullptr,
-        0, 0, false, 0
-    };
+    StepAllocation alloc = {};
+    alloc.d_state0_mem = nullptr;
+    alloc.d_accum_mem = nullptr;
+    alloc.d_resident_mem.fill(nullptr);
+    alloc.d_matter_mem = nullptr;
+    alloc.d_comm_mem = nullptr;
+    alloc.h_comm_mem = nullptr;
+    alloc.cap_all = 0;
+    alloc.cap_comm = 0;
+    alloc.h_comm_pinned = false;
+    alloc.cap_h_comm = 0;
     return alloc;
 }
 
@@ -523,16 +544,22 @@ static bool has_step_allocation(const StepAllocation &alloc)
 
 static StepAllocation detach_step_allocation(StepContext &ctx)
 {
-    StepAllocation alloc = {
-        ctx.d_state0_mem, ctx.d_accum_mem, ctx.d_state_curr_mem,
-        ctx.d_state_next_mem, ctx.d_matter_mem,
-        ctx.d_comm_mem, ctx.h_comm_mem,
-        ctx.cap_all, ctx.cap_comm, ctx.h_comm_pinned, ctx.cap_h_comm
-    };
+    StepAllocation alloc = {};
+    alloc.d_state0_mem = ctx.d_state0_mem;
+    alloc.d_accum_mem = ctx.d_accum_mem;
+    alloc.d_resident_mem = ctx.d_resident_mem;
+    alloc.d_matter_mem = ctx.d_matter_mem;
+    alloc.d_comm_mem = ctx.d_comm_mem;
+    alloc.h_comm_mem = ctx.h_comm_mem;
+    alloc.cap_all = ctx.cap_all;
+    alloc.cap_comm = ctx.cap_comm;
+    alloc.h_comm_pinned = ctx.h_comm_pinned;
+    alloc.cap_h_comm = ctx.cap_h_comm;
     ctx.d_state0_mem = nullptr;
     ctx.d_accum_mem = nullptr;
     ctx.d_state_curr_mem = nullptr;
     ctx.d_state_next_mem = nullptr;
+    ctx.d_resident_mem.fill(nullptr);
     ctx.d_matter_mem = nullptr;
     ctx.d_comm_mem = nullptr;
     ctx.h_comm_mem = nullptr;
@@ -542,13 +569,20 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
     ctx.cap_h_comm = 0;
     ctx.matter_ready = false;
     ctx.state_ready = false;
+    ctx.current_bank = -1;
+    ctx.resident_clock = 0;
     ctx.d_state0.fill(nullptr);
     ctx.d_accum.fill(nullptr);
     ctx.d_state_curr.fill(nullptr);
     ctx.d_state_next.fill(nullptr);
     ctx.d_matter.fill(nullptr);
-    ctx.resident_host.fill(nullptr);
-    ctx.resident_host_valid = false;
+    for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) {
+        ctx.d_resident[b].fill(nullptr);
+        ctx.resident_host[b].fill(nullptr);
+        ctx.resident_host_clean[b].fill(0);
+    }
+    ctx.resident_age.fill(0);
+    ctx.resident_valid.fill(false);
     return alloc;
 }
 
@@ -556,8 +590,9 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
 {
     ctx.d_state0_mem = alloc.d_state0_mem;
     ctx.d_accum_mem = alloc.d_accum_mem;
-    ctx.d_state_curr_mem = alloc.d_state_curr_mem;
-    ctx.d_state_next_mem = alloc.d_state_next_mem;
+    ctx.d_state_curr_mem = nullptr;
+    ctx.d_state_next_mem = nullptr;
+    ctx.d_resident_mem = alloc.d_resident_mem;
     ctx.d_matter_mem = alloc.d_matter_mem;
     ctx.d_comm_mem = alloc.d_comm_mem;
     ctx.h_comm_mem = alloc.h_comm_mem;
@@ -567,8 +602,14 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
     ctx.cap_h_comm = alloc.cap_h_comm;
     ctx.matter_ready = false;
     ctx.state_ready = false;
-    ctx.resident_host.fill(nullptr);
-    ctx.resident_host_valid = false;
+    ctx.current_bank = -1;
+    ctx.resident_clock = 0;
+    for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) {
+        ctx.resident_host[b].fill(nullptr);
+        ctx.resident_host_clean[b].fill(0);
+    }
+    ctx.resident_age.fill(0);
+    ctx.resident_valid.fill(false);
 }
 
 static void recycle_step_allocation(StepAllocation &alloc)
@@ -657,8 +698,10 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
         if (!has_step_allocation(alloc)) {
             CUDA_CHECK(cudaMalloc(&alloc.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
             CUDA_CHECK(cudaMalloc(&alloc.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
-            CUDA_CHECK(cudaMalloc(&alloc.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double)));
-            CUDA_CHECK(cudaMalloc(&alloc.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double)));
+            for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) {
+                CUDA_CHECK(cudaMalloc(&alloc.d_resident_mem[b],
+                                      BSSN_STATE_COUNT * all * sizeof(double)));
+            }
             CUDA_CHECK(cudaMalloc(&alloc.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
             alloc.cap_all = all;
         }
@@ -667,8 +710,13 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
     for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
         ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
         ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
-        ctx.d_state_curr[i] = ctx.d_state_curr_mem + (size_t)i * all;
-        ctx.d_state_next[i] = ctx.d_state_next_mem + (size_t)i * all;
+        for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) {
+            ctx.d_resident[b][i] = ctx.d_resident_mem[b] + (size_t)i * all;
+        }
+    }
+    if (ctx.current_bank >= 0) {
+        ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank];
+        ctx.d_state_curr = ctx.d_resident[ctx.current_bank];
     }
     for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
         ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all;
@@ -5467,6 +5515,25 @@ __global__ void kern_unpack_state_subset(double * __restrict__ dst_mem,
     }
 }
 
+static bool resident_key_usable(double **host_key);
+static int find_resident_bank(const StepContext &ctx, double **host_key);
+static void set_resident_host_clean(StepContext &ctx, int bank, bool clean);
+static bool resident_host_subset_clean(const StepContext &ctx,
+                                       int bank,
+                                       int subset_count,
+                                       const int *state_indices);
+static void mark_resident_host_subset_clean(StepContext &ctx,
+                                            int bank,
+                                            int subset_count,
+                                            const int *state_indices,
+                                            bool clean);
+static void mark_resident_current_bank(StepContext &ctx, int bank);
+static void update_state_ready(StepContext &ctx);
+static int active_or_keyed_bank(StepContext &ctx,
+                                double **host_key,
+                                size_t all,
+                                bool upload_if_missing);
+
 static void copy_state_region_cuda(void *block_tag,
                                    int state_index,
                                    double *host_state,
@@ -5494,6 +5561,12 @@ static void copy_state_region_cuda(void *block_tag,
         p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]);
     }
     CUDA_CHECK(cudaMemcpy3D(&p));
+    if (kind == cudaMemcpyHostToDevice && ctx.current_bank >= 0) {
+        ctx.resident_valid[ctx.current_bank] = true;
+        ctx.resident_age[ctx.current_bank] = ++ctx.resident_clock;
+        ctx.resident_host_clean[ctx.current_bank][state_index] = 0;
+        update_state_ready(ctx);
+    }
 }
 
 static void copy_state_region_packed_cuda(void *block_tag,
@@ -5527,6 +5600,12 @@ static void copy_state_region_packed_cuda(void *block_tag,
     }
 
     CUDA_CHECK(cudaMemcpy3D(&p));
+    if (kind == cudaMemcpyHostToDevice && ctx.current_bank >= 0) {
+        ctx.resident_valid[ctx.current_bank] = true;
+        ctx.resident_age[ctx.current_bank] = ++ctx.resident_clock;
+        ctx.resident_host_clean[ctx.current_bank][state_index] = 0;
+        update_state_ready(ctx);
+    }
 }
 
 static void copy_state_region_packed_batch_cuda(void *block_tag,
@@ -5565,6 +5644,12 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
             ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
             i0, j0, k0, sx, sy, sz, region_all, state_count,
             ex[0] * ex[1] * ex[2]);
+        if (ctx.current_bank >= 0) {
+            ctx.resident_valid[ctx.current_bank] = true;
+            ctx.resident_age[ctx.current_bank] = ++ctx.resident_clock;
+            mark_resident_host_subset_clean(ctx, ctx.current_bank, state_count, nullptr, false);
+            update_state_ready(ctx);
+        }
     }
 }
 
@@ -5594,7 +5679,14 @@ static void copy_state_region_packed_batch_device_cuda(void *block_tag,
             ctx.d_state_curr_mem, device_buffer, ex[0], ex[1],
             i0, j0, k0, sx, sy, sz, region_all, state_count,
             ex[0] * ex[1] * ex[2]);
-        ctx.state_ready = true;
+        if (ctx.current_bank >= 0) {
+            ctx.resident_valid[ctx.current_bank] = true;
+            ctx.resident_age[ctx.current_bank] = ++ctx.resident_clock;
+            mark_resident_host_subset_clean(ctx, ctx.current_bank, state_count, nullptr, false);
+            mark_resident_current_bank(ctx, ctx.current_bank);
+        } else {
+            ctx.state_ready = true;
+        }
     }
 }
 
@@ -5636,7 +5728,14 @@ static void copy_state_device_segments(void *block_tag,
             ctx.d_state_curr_mem, device_buffer,
             ex[0], ex[1], d_meta, state_count,
             ex[0] * ex[1] * ex[2]);
-        ctx.state_ready = true;
+        if (ctx.current_bank >= 0) {
+            ctx.resident_valid[ctx.current_bank] = true;
+            ctx.resident_age[ctx.current_bank] = ++ctx.resident_clock;
+            mark_resident_host_subset_clean(ctx, ctx.current_bank, state_count, nullptr, false);
+            mark_resident_current_bank(ctx, ctx.current_bank);
+        } else {
+            ctx.state_ready = true;
+        }
     }
 }
 
@@ -5715,6 +5814,14 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
     const size_t bytes = all * sizeof(double);
     StepContext &ctx = ensure_step_ctx(block_tag, all);
+    int bank = find_resident_bank(ctx, state_host_out);
+    if (bank < 0)
+        bank = (ctx.current_bank >= 0) ? ctx.current_bank : active_or_keyed_bank(ctx, nullptr, all, false);
+    if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT || !ctx.resident_valid[bank])
+        return;
+    mark_resident_current_bank(ctx, bank);
+    if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr))
+        return;
     static int direct_download = -1;
     if (direct_download < 0) {
         const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD");
@@ -5722,18 +5829,20 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
     }
     if (direct_download) {
         for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
-            CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_state_curr[i],
+            CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_resident[bank][i],
                                        bytes, cudaMemcpyDeviceToHost));
         }
         CUDA_CHECK(cudaDeviceSynchronize());
+        set_resident_host_clean(ctx, bank, true);
         return;
     }
-    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem,
+    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank],
                           (size_t)BSSN_STATE_COUNT * bytes,
                           cudaMemcpyDeviceToHost));
     for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
         std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
     }
+    set_resident_host_clean(ctx, bank, true);
 }
 
 static void copy_state_subset(void *block_tag,
@@ -5764,14 +5873,26 @@ static void copy_state_subset(void *block_tag,
     const size_t total_doubles = (size_t)active_count * all;
     double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
     double *h_comm = ensure_step_host_comm_buffer(ctx, total_doubles);
+    double *full_key[BSSN_STATE_COUNT];
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i)
+        full_key[i] = nullptr;
+    for (int i = 0; i < active_count; ++i)
+        full_key[active_state_indices[i]] = active_state_host[i];
+    int bank = active_or_keyed_bank(ctx, nullptr, all, kind == cudaMemcpyHostToDevice);
+    if (kind == cudaMemcpyHostToDevice && resident_key_usable(full_key))
+        bank = active_or_keyed_bank(ctx, full_key, all, false);
+    if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT)
+        return;
 
     CUDA_CHECK(cudaMemcpyToSymbol(d_subset_state_indices, active_state_indices,
                                   (size_t)active_count * sizeof(int),
                                   0, cudaMemcpyHostToDevice));
     if (kind == cudaMemcpyDeviceToHost) {
+        if (resident_host_subset_clean(ctx, bank, active_count, active_state_indices))
+            return;
         dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count);
         kern_pack_state_subset<<<launch_grid, BLK>>>(
-            ctx.d_state_curr_mem, d_comm, active_count, (int)all);
+            ctx.d_resident_mem[bank], d_comm, active_count, (int)all);
         CUDA_CHECK(cudaMemcpy(h_comm, d_comm,
                               total_doubles * sizeof(double),
                               cudaMemcpyDeviceToHost));
@@ -5780,6 +5901,7 @@ static void copy_state_subset(void *block_tag,
                         h_comm + (size_t)i * all,
                         bytes);
         }
+        mark_resident_host_subset_clean(ctx, bank, active_count, active_state_indices, true);
     } else {
         for (int i = 0; i < active_count; ++i) {
             std::memcpy(h_comm + (size_t)i * all,
@@ -5791,14 +5913,22 @@ static void copy_state_subset(void *block_tag,
                               cudaMemcpyHostToDevice));
         dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count);
         kern_unpack_state_subset<<<launch_grid, BLK>>>(
-            ctx.d_state_curr_mem, d_comm, active_count, (int)all);
+            ctx.d_resident_mem[bank], d_comm, active_count, (int)all);
+        ctx.resident_valid[bank] = true;
+        ctx.resident_age[bank] = ++ctx.resident_clock;
+        mark_resident_current_bank(ctx, bank);
+        mark_resident_host_subset_clean(ctx, bank, active_count, active_state_indices, true);
     }
 }
 
 static bool has_resident_state(void *block_tag)
 {
     auto it = g_step_ctx.find(block_tag);
-    return it != g_step_ctx.end() && it->second.state_ready;
+    if (it == g_step_ctx.end()) return false;
+    for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) {
+        if (it->second.resident_valid[b]) return true;
+    }
+    return false;
 }
 
 static bool resident_key_usable(double **host_key)
@@ -5810,26 +5940,375 @@ static bool resident_key_usable(double **host_key)
     return true;
 }
 
-static bool resident_key_matches(const StepContext &ctx, double **host_key)
+static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key)
 {
-    if (!ctx.state_ready || !ctx.resident_host_valid || !resident_key_usable(host_key))
+    if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT || !resident_key_usable(host_key))
         return false;
     for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
-        if (ctx.resident_host[i] != host_key[i]) return false;
+        if (ctx.resident_host[bank][i] != host_key[i]) return false;
     }
     return true;
 }
 
-static void set_resident_key(StepContext &ctx, double **host_key)
+static int find_resident_bank(const StepContext &ctx, double **host_key)
+{
+    if (!host_key) return -1;
+    for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) {
+        if (resident_key_matches(ctx, b, host_key))
+            return b;
+    }
+    return -1;
+}
+
+static void set_resident_host_clean(StepContext &ctx, int bank, bool clean)
+{
+    if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) return;
+    ctx.resident_host_clean[bank].fill(clean ? 1 : 0);
+}
+
+static bool resident_host_subset_clean(const StepContext &ctx,
+                                       int bank,
+                                       int subset_count,
+                                       const int *state_indices)
+{
+    if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) return false;
+    for (int i = 0; i < subset_count; ++i) {
+        const int state_index = state_indices ? state_indices[i] : i;
+        if (state_index < 0 || state_index >= BSSN_STATE_COUNT)
+            return false;
+        if (!ctx.resident_host_clean[bank][state_index])
+            return false;
+    }
+    return true;
+}
+
+static void mark_resident_host_subset_clean(StepContext &ctx,
+                                            int bank,
+                                            int subset_count,
+                                            const int *state_indices,
+                                            bool clean)
+{
+    if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) return;
+    for (int i = 0; i < subset_count; ++i) {
+        const int state_index = state_indices ? state_indices[i] : i;
+        if (state_index >= 0 && state_index < BSSN_STATE_COUNT)
+            ctx.resident_host_clean[bank][state_index] = clean ? 1 : 0;
+    }
+}
+
+static void mark_resident_current_bank(StepContext &ctx, int bank)
+{
+    if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) return;
+    ctx.current_bank = bank;
+    ctx.d_state_curr_mem = ctx.d_resident_mem[bank];
+    ctx.d_state_curr = ctx.d_resident[bank];
+    ctx.state_ready = ctx.resident_valid[bank];
+}
+
+static void mark_resident_next_bank(StepContext &ctx, int bank)
+{
+    if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) return;
+    ctx.d_state_next_mem = ctx.d_resident_mem[bank];
+    ctx.d_state_next = ctx.d_resident[bank];
+}
+
+static bool any_resident_bank_valid(const StepContext &ctx)
+{
+    for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) {
+        if (ctx.resident_valid[b]) return true;
+    }
+    return false;
+}
+
+static void update_state_ready(StepContext &ctx)
+{
+    ctx.state_ready = any_resident_bank_valid(ctx);
+}
+
+static void writeback_resident_bank(StepContext &ctx, int bank, size_t all)
+{
+    if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT) return;
+    if (!ctx.resident_valid[bank]) return;
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i)
+        if (!ctx.resident_host[bank][i]) return;
+    if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr))
+        return;
+    const size_t bytes = all * sizeof(double);
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        CUDA_CHECK(cudaMemcpyAsync(ctx.resident_host[bank][i],
+                                   ctx.d_resident[bank][i],
+                                   bytes, cudaMemcpyDeviceToHost));
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+    set_resident_host_clean(ctx, bank, true);
+}
+
+static int choose_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all)
+{
+    for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) {
+        if (b != avoid_bank && !ctx.resident_valid[b])
+            return b;
+    }
+
+    int best = -1;
+    unsigned long long best_age = 0;
+    for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) {
+        if (b == avoid_bank) continue;
+        if (best < 0 || ctx.resident_age[b] < best_age) {
+            best = b;
+            best_age = ctx.resident_age[b];
+        }
+    }
+    if (best < 0) best = 0;
+    writeback_resident_bank(ctx, best, all);
+    ctx.resident_valid[best] = false;
+    ctx.resident_host[best].fill(nullptr);
+    ctx.resident_host_clean[best].fill(0);
+    ctx.resident_age[best] = 0;
+    if (ctx.current_bank == best) {
+        ctx.current_bank = -1;
+        ctx.d_state_curr_mem = nullptr;
+        ctx.d_state_curr.fill(nullptr);
+    }
+    update_state_ready(ctx);
+    return best;
+}
+
+static void assign_resident_key(StepContext &ctx, int bank, double **host_key)
+{
+    if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT || !resident_key_usable(host_key))
+        return;
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i)
+        ctx.resident_host[bank][i] = host_key[i];
+    set_resident_host_clean(ctx, bank, false);
+    ctx.resident_age[bank] = ++ctx.resident_clock;
+}
+
+static int ensure_resident_bank(StepContext &ctx,
+                                double **host_key,
+                                size_t all,
+                                bool upload_if_missing,
+                                int avoid_bank = -1)
 {
     if (!resident_key_usable(host_key)) {
-        ctx.resident_host.fill(nullptr);
-        ctx.resident_host_valid = false;
-        return;
+        if (ctx.current_bank >= 0)
+            return ctx.current_bank;
+        return 0;
     }
-    for (int i = 0; i < BSSN_STATE_COUNT; ++i)
-        ctx.resident_host[i] = host_key[i];
-    ctx.resident_host_valid = true;
+
+    int bank = find_resident_bank(ctx, host_key);
+    if (bank >= 0) {
+        ctx.resident_age[bank] = ++ctx.resident_clock;
+        if (!ctx.resident_valid[bank] && upload_if_missing) {
+            bind_state_input_slots(ctx.d_resident[bank]);
+            upload_state_inputs(host_key, all);
+            ctx.resident_valid[bank] = true;
+            set_resident_host_clean(ctx, bank, true);
+        }
+        return bank;
+    }
+
+    bank = choose_resident_bank_for_reuse(ctx, avoid_bank, all);
+    assign_resident_key(ctx, bank, host_key);
+    if (upload_if_missing) {
+        bind_state_input_slots(ctx.d_resident[bank]);
+        upload_state_inputs(host_key, all);
+        ctx.resident_valid[bank] = true;
+        set_resident_host_clean(ctx, bank, true);
+    } else {
+        ctx.resident_valid[bank] = false;
+        set_resident_host_clean(ctx, bank, false);
+    }
+    update_state_ready(ctx);
+    return bank;
+}
+
+static int reserve_resident_output_bank(StepContext &ctx,
+                                        double **host_key,
+                                        size_t all,
+                                        int input_bank)
+{
+    if (!resident_key_usable(host_key))
+        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;
+    if (resident_key_matches(ctx, input_bank, host_key))
+        return input_bank;
+
+    int bank = find_resident_bank(ctx, host_key);
+    if (bank < 0)
+        bank = choose_resident_bank_for_reuse(ctx, input_bank, all);
+    assign_resident_key(ctx, bank, host_key);
+    ctx.resident_valid[bank] = false;
+    ctx.resident_age[bank] = ++ctx.resident_clock;
+    update_state_ready(ctx);
+    return bank;
+}
+
+static int active_or_keyed_bank(StepContext &ctx,
+                                double **host_key,
+                                size_t all,
+                                bool upload_if_missing)
+{
+    if (resident_key_usable(host_key)) {
+        const int bank = ensure_resident_bank(ctx, host_key, all, upload_if_missing);
+        mark_resident_current_bank(ctx, bank);
+        return bank;
+    }
+    if (ctx.current_bank >= 0)
+        return ctx.current_bank;
+    for (int b = 0; b < Z4C_RESIDENT_BANK_COUNT; ++b) {
+        if (ctx.resident_valid[b]) {
+            mark_resident_current_bank(ctx, b);
+            return b;
+        }
+    }
+    return 0;
+}
+
+__device__ double interp_lagrange_weight(int idx, double x, int ordn)
+{
+    double w = 1.0;
+    const double xi = (double)idx;
+    for (int j = 0; j < ordn; ++j) {
+        if (j == idx) continue;
+        w *= (x - (double)j) / (xi - (double)j);
+    }
+    return w;
+}
+
+__device__ void interp_axis_window(double p,
+                                   double x0,
+                                   double dx,
+                                   int n,
+                                   int ordn,
+                                   int symmetry,
+                                   int axis,
+                                   int &base,
+                                   double &local_x)
+{
+    int cx_i = (int)((p - x0) / dx + 0.4) + 1;
+    int cx_b = cx_i - ordn / 2 + 1;
+    int cx_t = cx_b + ordn - 1;
+    int cmin = 1;
+    if (symmetry == 2 && axis < 2 && fabs(x0) < dx)
+        cmin = -ordn / 2 + 1;
+    if (symmetry != 0 && axis == 2 && fabs(x0) < dx)
+        cmin = -ordn / 2 + 1;
+
+    if (cx_b < cmin) {
+        cx_b = cmin;
+        cx_t = cx_b + ordn - 1;
+    }
+    if (cx_t > n) {
+        cx_t = n;
+        cx_b = cx_t + 1 - ordn;
+    }
+
+    base = cx_b;
+    if (cx_b > 0) {
+        const double xb = x0 + (double)(cx_b - 1) * dx;
+        local_x = (p - xb) / dx;
+    } else {
+        const int reflected = 1 - cx_b;
+        const double xb = x0 + (double)(reflected - 1) * dx;
+        local_x = (p + xb) / dx;
+    }
+}
+
+__device__ double load_interp_value(const double * __restrict__ mem,
+                                    int nx,
+                                    int ny,
+                                    int nz,
+                                    int all,
+                                    int state,
+                                    int fi,
+                                    int fj,
+                                    int fk,
+                                    const double * __restrict__ soa)
+{
+    double sign = 1.0;
+    int ii = fi;
+    int jj = fj;
+    int kk = fk;
+    if (ii <= 0) {
+        ii = 1 - ii;
+        sign *= soa[0];
+    }
+    if (jj <= 0) {
+        jj = 1 - jj;
+        sign *= soa[1];
+    }
+    if (kk <= 0) {
+        kk = 1 - kk;
+        sign *= soa[2];
+    }
+    if (ii < 1 || ii > nx || jj < 1 || jj > ny || kk < 1 || kk > nz)
+        return 0.0;
+    const int idx = (ii - 1) + (jj - 1) * nx + (kk - 1) * nx * ny;
+    return sign * mem[(size_t)state * (size_t)all + (size_t)idx];
+}
+
+__global__ void kern_interp_state_point3(const double * __restrict__ mem,
+                                         double * __restrict__ out,
+                                         int nx,
+                                         int ny,
+                                         int nz,
+                                         int all,
+                                         int state0,
+                                         int state1,
+                                         int state2,
+                                         double x0,
+                                         double y0,
+                                         double z0,
+                                         double dx,
+                                         double dy,
+                                         double dz,
+                                         double px,
+                                         double py,
+                                         double pz,
+                                         int ordn,
+                                         int symmetry,
+                                         double soa00, double soa01, double soa02,
+                                         double soa10, double soa11, double soa12,
+                                         double soa20, double soa21, double soa22)
+{
+    const int f = threadIdx.x;
+    if (f >= 3 || ordn <= 0 || ordn > 8)
+        return;
+
+    const int states[3] = {state0, state1, state2};
+    const double soa_all[9] = {
+        soa00, soa01, soa02,
+        soa10, soa11, soa12,
+        soa20, soa21, soa22
+    };
+    const double *soa = soa_all + 3 * f;
+
+    int ib, jb, kb;
+    double tx, ty, tz;
+    interp_axis_window(px, x0, dx, nx, ordn, symmetry, 0, ib, tx);
+    interp_axis_window(py, y0, dy, ny, ordn, symmetry, 1, jb, ty);
+    interp_axis_window(pz, z0, dz, nz, ordn, symmetry, 2, kb, tz);
+
+    double wx[8], wy[8], wz[8];
+    for (int i = 0; i < ordn; ++i) {
+        wx[i] = interp_lagrange_weight(i, tx, ordn);
+        wy[i] = interp_lagrange_weight(i, ty, ordn);
+        wz[i] = interp_lagrange_weight(i, tz, ordn);
+    }
+
+    double value = 0.0;
+    for (int k = 0; k < ordn; ++k) {
+        for (int j = 0; j < ordn; ++j) {
+            for (int i = 0; i < ordn; ++i) {
+                const double coeff = wx[i] * wy[j] * wz[k];
+                value += coeff * load_interp_value(mem, nx, ny, nz, all,
+                                                    states[f],
+                                                    ib + i, jb + j, kb + k,
+                                                    soa);
+            }
+        }
+    }
+    out[f] = value;
 }
 
 
@@ -7788,8 +8267,6 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag,
                                     double &chitiny)
 {
     (void)T;
-    (void)state_host_out;
-
     if (RK4 < 0 || RK4 > 3) return 1;
 
     using namespace z4c_cuda;
@@ -7820,23 +8297,27 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag,
 
     StepContext &ctx = ensure_step_ctx(block_tag, all);
     const bool use_resident_state = (keep_resident_state != 0);
+    int input_bank = -1;
+    int output_bank = -1;
     if (use_resident_state) {
-        bind_state_input_slots(ctx.d_state_curr);
-        bind_state_output_slots(ctx.d_state_next);
+        input_bank = ensure_resident_bank(ctx, state_host_in, all, true);
+        output_bank = reserve_resident_output_bank(ctx, state_host_out, all, input_bank);
+        mark_resident_current_bank(ctx, input_bank);
+        mark_resident_next_bank(ctx, output_bank);
+        bind_state_input_slots(ctx.d_resident[input_bank]);
+        bind_state_output_slots(ctx.d_resident[output_bank]);
     }
     double t0 = profile ? cuda_profile_now_ms() : 0.0;
-    if (!use_resident_state || !ctx.state_ready) {
+    if (!use_resident_state) {
         upload_state_inputs(state_host_in, all);
-        if (use_resident_state) {
-            ctx.state_ready = true;
-            set_resident_key(ctx, state_host_in);
-        }
     }
     if (apply_enforce_ga) {
         kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
                                                  g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
                                                  g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
                                                  g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
+        if (use_resident_state && input_bank >= 0)
+            set_resident_host_clean(ctx, input_bank, false);
     }
     if (RK4 == 0) {
         CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
@@ -7887,10 +8368,10 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag,
 
     t0 = profile ? cuda_profile_now_ms() : 0.0;
     if (use_resident_state) {
-        std::swap(ctx.d_state_curr_mem, ctx.d_state_next_mem);
-        ctx.d_state_curr.swap(ctx.d_state_next);
-        ctx.state_ready = true;
-        set_resident_key(ctx, state_host_out);
+        ctx.resident_valid[output_bank] = true;
+        ctx.resident_age[output_bank] = ++ctx.resident_clock;
+        set_resident_host_clean(ctx, output_bank, false);
+        mark_resident_current_bank(ctx, output_bank);
     } else {
         download_state_outputs(state_host_out, all);
     }
@@ -7931,6 +8412,8 @@ extern "C" int z4c_cuda_pack_state_region_to_host_buffer(void *block_tag,
     using namespace z4c_cuda;
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false);
     copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
                                   i0, j0, k0, sx, sy, sz,
                                   cudaMemcpyDeviceToHost);
@@ -7947,6 +8430,8 @@ extern "C" int z4c_cuda_unpack_state_region_from_host_buffer(void *block_tag,
     using namespace z4c_cuda;
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false);
     copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
                                   i0, j0, k0, sx, sy, sz,
                                   cudaMemcpyHostToDevice);
@@ -7963,6 +8448,27 @@ extern "C" int z4c_cuda_pack_state_batch_to_host_buffer(void *block_tag,
     using namespace z4c_cuda;
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false);
+    copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
+                                        i0, j0, k0, sx, sy, sz,
+                                        cudaMemcpyDeviceToHost);
+    return 0;
+}
+
+extern "C" int z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
+                                                                       double **state_host_key,
+                                                                       int state_count,
+                                                                       double *host_buffer,
+                                                                       int *ex,
+                                                                       int i0, int j0, int k0,
+                                                                       int sx, int sy, int sz)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true);
     copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
                                         i0, j0, k0, sx, sy, sz,
                                         cudaMemcpyDeviceToHost);
@@ -7979,6 +8485,29 @@ extern "C" int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
     using namespace z4c_cuda;
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false);
+    copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
+                                        i0, j0, k0, sx, sy, sz,
+                                        cudaMemcpyHostToDevice);
+    return 0;
+}
+
+extern "C" int z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
+                                                                           double **state_host_key,
+                                                                           int state_count,
+                                                                           double *host_buffer,
+                                                                           int *ex,
+                                                                           int i0, int j0, int k0,
+                                                                           int sx, int sy, int sz)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const int bank = ensure_resident_bank(ctx, state_host_key,
+                                          (size_t)ex[0] * ex[1] * ex[2], false);
+    mark_resident_current_bank(ctx, bank);
     copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
                                         i0, j0, k0, sx, sy, sz,
                                         cudaMemcpyHostToDevice);
@@ -7995,6 +8524,26 @@ extern "C" int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
     using namespace z4c_cuda;
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false);
+    copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
+                                               i0, j0, k0, sx, sy, sz, 1);
+    return 0;
+}
+
+extern "C" int z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
+                                                                         double **state_host_key,
+                                                                         int state_count,
+                                                                         double *device_buffer,
+                                                                         int *ex,
+                                                                         int i0, int j0, int k0,
+                                                                         int sx, int sy, int sz)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true);
     copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
                                                i0, j0, k0, sx, sy, sz, 1);
     return 0;
@@ -8010,6 +8559,28 @@ extern "C" int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
     using namespace z4c_cuda;
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false);
+    copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
+                                               i0, j0, k0, sx, sy, sz, 0);
+    return 0;
+}
+
+extern "C" int z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
+                                                                             double **state_host_key,
+                                                                             int state_count,
+                                                                             double *device_buffer,
+                                                                             int *ex,
+                                                                             int i0, int j0, int k0,
+                                                                             int sx, int sy, int sz)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const int bank = ensure_resident_bank(ctx, state_host_key,
+                                          (size_t)ex[0] * ex[1] * ex[2], false);
+    mark_resident_current_bank(ctx, bank);
     copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
                                                i0, j0, k0, sx, sy, sz, 0);
     return 0;
@@ -8025,6 +8596,26 @@ extern "C" int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag,
     using namespace z4c_cuda;
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false);
+    copy_state_device_segments(block_tag, state_count, device_buffer, ex,
+                               segment_count, segment_meta, 1);
+    return 0;
+}
+
+extern "C" int z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
+                                                                            double **state_host_key,
+                                                                            int state_count,
+                                                                            double *device_buffer,
+                                                                            int *ex,
+                                                                            int segment_count,
+                                                                            const int *segment_meta)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true);
     copy_state_device_segments(block_tag, state_count, device_buffer, ex,
                                segment_count, segment_meta, 1);
     return 0;
@@ -8040,6 +8631,28 @@ extern "C" int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag
     using namespace z4c_cuda;
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false);
+    copy_state_device_segments(block_tag, state_count, device_buffer, ex,
+                               segment_count, segment_meta, 0);
+    return 0;
+}
+
+extern "C" int z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
+                                                                                double **state_host_key,
+                                                                                int state_count,
+                                                                                double *device_buffer,
+                                                                                int *ex,
+                                                                                int segment_count,
+                                                                                const int *segment_meta)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const int bank = ensure_resident_bank(ctx, state_host_key,
+                                          (size_t)ex[0] * ex[1] * ex[2], false);
+    mark_resident_current_bank(ctx, bank);
     copy_state_device_segments(block_tag, state_count, device_buffer, ex,
                                segment_count, segment_meta, 0);
     return 0;
@@ -8056,6 +8669,27 @@ extern "C" int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag
     using namespace z4c_cuda;
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false);
+    restrict_state_device_segments(block_tag, state_count, device_buffer, ex,
+                                   segment_count, segment_meta, state_soa);
+    return 0;
+}
+
+extern "C" int z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag,
+                                                                                double **state_host_key,
+                                                                                int state_count,
+                                                                                double *device_buffer,
+                                                                                int *ex,
+                                                                                int segment_count,
+                                                                                const int *segment_meta,
+                                                                                const double *state_soa)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true);
     restrict_state_device_segments(block_tag, state_count, device_buffer, ex,
                                    segment_count, segment_meta, state_soa);
     return 0;
@@ -8072,6 +8706,27 @@ extern "C" int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
     using namespace z4c_cuda;
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false);
+    prolong_state_device_segments(block_tag, state_count, device_buffer, ex,
+                                  segment_count, segment_meta, state_soa);
+    return 0;
+}
+
+extern "C" int z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag,
+                                                                               double **state_host_key,
+                                                                               int state_count,
+                                                                               double *device_buffer,
+                                                                               int *ex,
+                                                                               int segment_count,
+                                                                               const int *segment_meta,
+                                                                               const double *state_soa)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true);
     prolong_state_device_segments(block_tag, state_count, device_buffer, ex,
                                   segment_count, segment_meta, state_soa);
     return 0;
@@ -8091,6 +8746,35 @@ extern "C" int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
     if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
     if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false);
+    const int region_all = sx * sy * sz;
+    upload_comm_state_soa(state_soa, state_count);
+    dim3 launch_grid((unsigned int)grid((size_t)region_all),
+                     (unsigned int)state_count);
+    kern_restrict_state_region_batch<<<launch_grid, BLK>>>(
+        ctx.d_state_curr_mem, device_buffer,
+        ex[0], ex[1], sx, sy, sz,
+        fi0, fj0, fk0, region_all, state_count,
+        ex[0] * ex[1] * ex[2]);
+    return 0;
+}
+
+extern "C" int z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
+                                                                             double **state_host_key,
+                                                                             int state_count,
+                                                                             double *device_buffer,
+                                                                             int *ex,
+                                                                             int sx, int sy, int sz,
+                                                                             int fi0, int fj0, int fk0,
+                                                                             const double *state_soa)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
+    if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true);
     const int region_all = sx * sy * sz;
     upload_comm_state_soa(state_soa, state_count);
     dim3 launch_grid((unsigned int)grid((size_t)region_all),
@@ -8118,6 +8802,37 @@ extern "C" int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
     if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
     if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, nullptr, (size_t)ex[0] * ex[1] * ex[2], false);
+    const int region_all = sx * sy * sz;
+    upload_comm_state_soa(state_soa, state_count);
+    dim3 launch_grid((unsigned int)grid((size_t)region_all),
+                     (unsigned int)state_count);
+    kern_prolong_state_region_batch<<<launch_grid, BLK>>>(
+        ctx.d_state_curr_mem, device_buffer,
+        ex[0], ex[1], sx, sy, sz,
+        ii0, jj0, kk0, lbc_i, lbc_j, lbc_k,
+        region_all, state_count,
+        ex[0] * ex[1] * ex[2]);
+    return 0;
+}
+
+extern "C" int z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
+                                                                            double **state_host_key,
+                                                                            int state_count,
+                                                                            double *device_buffer,
+                                                                            int *ex,
+                                                                            int sx, int sy, int sz,
+                                                                            int ii0, int jj0, int kk0,
+                                                                            int lbc_i, int lbc_j, int lbc_k,
+                                                                            const double *state_soa)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
+    if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    active_or_keyed_bank(ctx, state_host_key, (size_t)ex[0] * ex[1] * ex[2], true);
     const int region_all = sx * sy * sz;
     upload_comm_state_soa(state_soa, state_count);
     dim3 launch_grid((unsigned int)grid((size_t)region_all),
@@ -8169,11 +8884,66 @@ extern "C" int z4c_cuda_compute_constraints_resident(void *block_tag,
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
     if (!block_tag || !ex || !constraint_host_out) return 1;
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
-    if (!ctx.state_ready) return 1;
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const int bank = active_or_keyed_bank(ctx, nullptr, all, false);
+    if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT || !ctx.resident_valid[bank]) return 1;
     setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
     bind_state_input_slots(ctx.d_state_curr);
-    launch_z4c_rhs_pipeline((int)((size_t)ex[0] * ex[1] * ex[2]), eps);
-    download_constraint_outputs(constraint_host_out, (size_t)ex[0] * ex[1] * ex[2]);
+    launch_z4c_rhs_pipeline((int)all, eps);
+    download_constraint_outputs(constraint_host_out, all);
+    return 0;
+}
+
+extern "C" int z4c_cuda_interp_state_point3(void *block_tag,
+                                            int *ex,
+                                            int state0,
+                                            int state1,
+                                            int state2,
+                                            double x0,
+                                            double y0,
+                                            double z0,
+                                            double dx,
+                                            double dy,
+                                            double dz,
+                                            double px,
+                                            double py,
+                                            double pz,
+                                            int ordn,
+                                            int symmetry,
+                                            const double *soa3,
+                                            double *out3)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    if (!block_tag || !ex || !out3 || !soa3)
+        return 1;
+    if (state0 < 0 || state0 >= BSSN_STATE_COUNT ||
+        state1 < 0 || state1 >= BSSN_STATE_COUNT ||
+        state2 < 0 || state2 >= BSSN_STATE_COUNT)
+        return 1;
+    if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
+        ordn <= 0 || ordn > 8 ||
+        ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
+        return 1;
+
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const int bank = active_or_keyed_bank(ctx, nullptr, all, false);
+    if (bank < 0 || bank >= Z4C_RESIDENT_BANK_COUNT || !ctx.resident_valid[bank])
+        return 1;
+
+    double *d_out = ensure_step_comm_buffer(ctx, 3);
+    kern_interp_state_point3<<<1, 3>>>(
+        ctx.d_resident_mem[bank], d_out,
+        ex[0], ex[1], ex[2], (int)all,
+        state0, state1, state2,
+        x0, y0, z0, dx, dy, dz,
+        px, py, pz, ordn, symmetry,
+        soa3[0], soa3[1], soa3[2],
+        soa3[3], soa3[4], soa3[5],
+        soa3[6], soa3[7], soa3[8]);
+    CUDA_CHECK(cudaMemcpy(out3, d_out, 3 * sizeof(double), cudaMemcpyDeviceToHost));
     return 0;
 }
 
@@ -8204,7 +8974,8 @@ extern "C" int z4c_cuda_resident_state_matches(void *block_tag,
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
     auto it = g_step_ctx.find(block_tag);
     if (it == g_step_ctx.end()) return 0;
-    return resident_key_matches(it->second, state_host_key) ? 1 : 0;
+    const int bank = find_resident_bank(it->second, state_host_key);
+    return (bank >= 0 && it->second.resident_valid[bank]) ? 1 : 0;
 }
 
 extern "C" void z4c_cuda_release_step_ctx(void *block_tag)
diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.h b/AMSS_NCKU_source/z4c_rhs_cuda.h
index 2f81925..4a9242d 100644
--- a/AMSS_NCKU_source/z4c_rhs_cuda.h
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.h
@@ -53,6 +53,14 @@ int z4c_cuda_pack_state_batch_to_host_buffer(void *block_tag,
                                              int i0, int j0, int k0,
                                              int sx, int sy, int sz);
 
+int z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
+                                                            double **state_host_key,
+                                                            int state_count,
+                                                            double *host_buffer,
+                                                            int *ex,
+                                                            int i0, int j0, int k0,
+                                                            int sx, int sy, int sz);
+
 int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                  int state_count,
                                                  double *host_buffer,
@@ -60,6 +68,14 @@ int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                  int i0, int j0, int k0,
                                                  int sx, int sy, int sz);
 
+int z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
+                                                                double **state_host_key,
+                                                                int state_count,
+                                                                double *host_buffer,
+                                                                int *ex,
+                                                                int i0, int j0, int k0,
+                                                                int sx, int sy, int sz);
+
 int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                                int state_count,
                                                double *device_buffer,
@@ -67,6 +83,14 @@ int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                                int i0, int j0, int k0,
                                                int sx, int sy, int sz);
 
+int z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
+                                                              double **state_host_key,
+                                                              int state_count,
+                                                              double *device_buffer,
+                                                              int *ex,
+                                                              int i0, int j0, int k0,
+                                                              int sx, int sy, int sz);
+
 int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                    int state_count,
                                                    double *device_buffer,
@@ -74,6 +98,14 @@ int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                    int i0, int j0, int k0,
                                                    int sx, int sy, int sz);
 
+int z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
+                                                                  double **state_host_key,
+                                                                  int state_count,
+                                                                  double *device_buffer,
+                                                                  int *ex,
+                                                                  int i0, int j0, int k0,
+                                                                  int sx, int sy, int sz);
+
 int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
@@ -81,6 +113,14 @@ int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag,
                                                   int segment_count,
                                                   const int *segment_meta);
 
+int z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
+                                                                 double **state_host_key,
+                                                                 int state_count,
+                                                                 double *device_buffer,
+                                                                 int *ex,
+                                                                 int segment_count,
+                                                                 const int *segment_meta);
+
 int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
                                                       int state_count,
                                                       double *device_buffer,
@@ -88,6 +128,14 @@ int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
                                                       int segment_count,
                                                       const int *segment_meta);
 
+int z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
+                                                                     double **state_host_key,
+                                                                     int state_count,
+                                                                     double *device_buffer,
+                                                                     int *ex,
+                                                                     int segment_count,
+                                                                     const int *segment_meta);
+
 int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
                                                       int state_count,
                                                       double *device_buffer,
@@ -96,6 +144,15 @@ int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
                                                       const int *segment_meta,
                                                       const double *state_soa);
 
+int z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag,
+                                                                     double **state_host_key,
+                                                                     int state_count,
+                                                                     double *device_buffer,
+                                                                     int *ex,
+                                                                     int segment_count,
+                                                                     const int *segment_meta,
+                                                                     const double *state_soa);
+
 int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
                                                      int state_count,
                                                      double *device_buffer,
@@ -104,6 +161,15 @@ int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
                                                      const int *segment_meta,
                                                      const double *state_soa);
 
+int z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag,
+                                                                    double **state_host_key,
+                                                                    int state_count,
+                                                                    double *device_buffer,
+                                                                    int *ex,
+                                                                    int segment_count,
+                                                                    const int *segment_meta,
+                                                                    const double *state_soa);
+
 int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                    int state_count,
                                                    double *device_buffer,
@@ -112,6 +178,15 @@ int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                    int fi0, int fj0, int fk0,
                                                    const double *state_soa);
 
+int z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
+                                                                  double **state_host_key,
+                                                                  int state_count,
+                                                                  double *device_buffer,
+                                                                  int *ex,
+                                                                  int sx, int sy, int sz,
+                                                                  int fi0, int fj0, int fk0,
+                                                                  const double *state_soa);
+
 int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
@@ -121,6 +196,16 @@ int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                   int lbc_i, int lbc_j, int lbc_k,
                                                   const double *state_soa);
 
+int z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
+                                                                 double **state_host_key,
+                                                                 int state_count,
+                                                                 double *device_buffer,
+                                                                 int *ex,
+                                                                 int sx, int sy, int sz,
+                                                                 int ii0, int jj0, int kk0,
+                                                                 int lbc_i, int lbc_j, int lbc_k,
+                                                                 const double *state_soa);
+
 int z4c_cuda_download_state_subset(void *block_tag,
                                    int *ex,
                                    int subset_count,
@@ -138,6 +223,25 @@ int z4c_cuda_compute_constraints_resident(void *block_tag,
                                           int Symmetry, double eps, int co,
                                           double **constraint_host_out);
 
+int z4c_cuda_interp_state_point3(void *block_tag,
+                                 int *ex,
+                                 int state0,
+                                 int state1,
+                                 int state2,
+                                 double x0,
+                                 double y0,
+                                 double z0,
+                                 double dx,
+                                 double dy,
+                                 double dz,
+                                 double px,
+                                 double py,
+                                 double pz,
+                                 int ordn,
+                                 int symmetry,
+                                 const double *soa3,
+                                 double *out3);
+
 int z4c_cuda_download_constraint_outputs(int *ex,
                                          double **constraint_host_out);