From 1ee229a91f1b1890bbf106bb9feb90fa287a4a8b Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Wed, 29 Apr 2026 19:44:19 +0800
Subject: [PATCH] Add keyed BSSN CUDA resident banks

---
 AMSS_NCKU_source/Parallel.C       | 194 ++++++--
 AMSS_NCKU_source/bssn_class.C     |  13 +-
 AMSS_NCKU_source/bssn_rhs_cuda.cu | 718 ++++++++++++++++++++++++++++--
 AMSS_NCKU_source/bssn_rhs_cuda.h  |  74 +++
 4 files changed, 910 insertions(+), 89 deletions(-)
diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index 79a89fe..ead3566 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -172,6 +172,26 @@ int cuda_state_var_count(MyList<var> *src_vars, MyList<var> *dst_vars)
   return (src_vars || dst_vars) ? -1 : count;
 }
 
+#if USE_CUDA_BSSN
+bool cuda_build_bssn_host_views(Block *block,
+                                MyList<var> *vars,
+                                int state_count,
+                                double **views)
+{
+  if (!block || !vars || !views || state_count != BSSN_CUDA_STATE_COUNT)
+    return false;
+  MyList<var> *v = vars;
+  for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i)
+  {
+    if (!v)
+      return false;
+    views[i] = block->fgfs[v->data->sgfn];
+    v = v->next;
+  }
+  return v == 0;
+}
+#endif
+
 #if USE_CUDA_BSSN || USE_CUDA_Z4C
 int fortran_idint(double x)
 {
@@ -318,7 +338,8 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type)
 bool cuda_direct_pack_segment(double *buffer,
                               const Parallel::gridseg *src,
                               const Parallel::gridseg *dst,
-                              int state_count)
+                              int state_count,
+                              MyList<var> *VarLists)
 {
 #if USE_CUDA_Z4C && (ABEtype == 2)
   if (state_count != Z4C_CUDA_STATE_COUNT)
@@ -338,9 +359,17 @@ bool cuda_direct_pack_segment(double *buffer,
                                                            i0, j0, k0,
                                                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
 #else
-  const bool ok = bssn_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape,
-                                                            i0, j0, k0,
-                                                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+  double *views[BSSN_CUDA_STATE_COUNT];
+  const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
+  const bool ok = have_views
+                      ? bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(
+                            src->Bg, views, state_count, buffer, src->Bg->shape,
+                            i0, j0, k0,
+                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0
+                      : bssn_cuda_pack_state_batch_to_host_buffer(
+                            src->Bg, state_count, buffer, src->Bg->shape,
+                            i0, j0, k0,
+                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
 #endif
   if (sync_profile_enabled())
     sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
@@ -349,7 +378,8 @@ bool cuda_direct_pack_segment(double *buffer,
 
 bool cuda_direct_unpack_segment(double *buffer,
                                 const Parallel::gridseg *dst,
-                                int state_count)
+                                int state_count,
+                                MyList<var> *VarListd)
 {
 #if USE_CUDA_Z4C && (ABEtype == 2)
   if (state_count != Z4C_CUDA_STATE_COUNT)
@@ -369,9 +399,17 @@ bool cuda_direct_unpack_segment(double *buffer,
                                                                i0, j0, k0,
                                                                dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
 #else
-  const bool ok = bssn_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape,
-                                                                i0, j0, k0,
-                                                                dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+  double *views[BSSN_CUDA_STATE_COUNT];
+  const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
+  const bool ok = have_views
+                      ? bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(
+                            dst->Bg, views, state_count, buffer, dst->Bg->shape,
+                            i0, j0, k0,
+                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0
+                      : bssn_cuda_unpack_state_batch_from_host_buffer(
+                            dst->Bg, state_count, buffer, dst->Bg->shape,
+                            i0, j0, k0,
+                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
 #endif
   if (sync_profile_enabled())
     sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
@@ -441,43 +479,62 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
                                         const Parallel::gridseg *src,
                                         const Parallel::gridseg *dst,
                                         int state_count,
-                                        int type)
+                                        int type,
+                                        MyList<var> *VarLists)
 {
 #if USE_CUDA_BSSN
   if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
     return false;
   const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
   bool ok = false;
+  double *views[BSSN_CUDA_STATE_COUNT];
+  const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
   if (type == 1)
   {
     const int i0 = cuda_seg_begin(dst, src->Bg, 0);
     const int j0 = cuda_seg_begin(dst, src->Bg, 1);
     const int k0 = cuda_seg_begin(dst, src->Bg, 2);
-    ok = bssn_cuda_pack_state_batch_to_device_buffer(
-             src->Bg, state_count, buffer, src->Bg->shape,
-             i0, j0, k0,
-             dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+    ok = have_views
+             ? bssn_cuda_pack_state_batch_to_device_buffer_for_host_views(
+                   src->Bg, views, state_count, buffer, src->Bg->shape,
+                   i0, j0, k0,
+                   dst->shape[0], dst->shape[1], dst->shape[2]) == 0
+             : bssn_cuda_pack_state_batch_to_device_buffer(
+                   src->Bg, state_count, buffer, src->Bg->shape,
+                   i0, j0, k0,
+                   dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
   }
   else if (type == 2)
   {
     int first_fine[3];
     if (!cuda_cell_gw3_restrict_params(src, dst, first_fine))
       return false;
-    ok = bssn_cuda_restrict_state_batch_to_device_buffer(
-             src->Bg, state_count, buffer, src->Bg->shape,
-             dst->shape[0], dst->shape[1], dst->shape[2],
-             first_fine[0], first_fine[1], first_fine[2]) == 0;
+    ok = have_views
+             ? bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(
+                   src->Bg, views, state_count, buffer, src->Bg->shape,
+                   dst->shape[0], dst->shape[1], dst->shape[2],
+                   first_fine[0], first_fine[1], first_fine[2]) == 0
+             : bssn_cuda_restrict_state_batch_to_device_buffer(
+                   src->Bg, state_count, buffer, src->Bg->shape,
+                   dst->shape[0], dst->shape[1], dst->shape[2],
+                   first_fine[0], first_fine[1], first_fine[2]) == 0;
   }
   else if (type == 3)
   {
     int first_fine_ii[3], coarse_lb[3];
     if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb))
       return false;
-    ok = bssn_cuda_prolong_state_batch_to_device_buffer(
-             src->Bg, state_count, buffer, src->Bg->shape,
-             dst->shape[0], dst->shape[1], dst->shape[2],
-             first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
-             coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0;
+    ok = have_views
+             ? bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(
+                   src->Bg, views, state_count, buffer, src->Bg->shape,
+                   dst->shape[0], dst->shape[1], dst->shape[2],
+                   first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
+                   coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0
+             : bssn_cuda_prolong_state_batch_to_device_buffer(
+                   src->Bg, state_count, buffer, src->Bg->shape,
+                   dst->shape[0], dst->shape[1], dst->shape[2],
+                   first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
+                   coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0;
   }
   if (sync_profile_enabled())
     sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
@@ -490,7 +547,8 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
 
 bool cuda_direct_unpack_segment_from_device(double *buffer,
                                             const Parallel::gridseg *dst,
-                                            int state_count)
+                                            int state_count,
+                                            MyList<var> *VarListd)
 {
 #if USE_CUDA_BSSN
   if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
@@ -499,10 +557,17 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
   const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
   const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
   const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
-  const bool ok = bssn_cuda_unpack_state_batch_from_device_buffer(
-                      dst->Bg, state_count, buffer, dst->Bg->shape,
-                      i0, j0, k0,
-                      dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+  double *views[BSSN_CUDA_STATE_COUNT];
+  const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
+  const bool ok = have_views
+                      ? bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(
+                            dst->Bg, views, state_count, buffer, dst->Bg->shape,
+                            i0, j0, k0,
+                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0
+                      : bssn_cuda_unpack_state_batch_from_device_buffer(
+                            dst->Bg, state_count, buffer, dst->Bg->shape,
+                            i0, j0, k0,
+                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
   if (sync_profile_enabled())
     sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
   return ok;
@@ -577,16 +642,25 @@ bool cuda_flush_device_segment_batch(Block *block,
                                      double *data,
                                      int state_count,
                                      const std::vector<int> &meta,
-                                     int dir)
+                                     int dir,
+                                     MyList<var> *vars)
 {
   if (!block || meta.empty())
     return true;
   const int segment_count = (int)(meta.size() / 8);
+  double *views[BSSN_CUDA_STATE_COUNT];
+  const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views);
   if (dir == PACK)
-    return bssn_cuda_pack_state_segments_to_device_buffer(
-               block, state_count, data, block->shape, segment_count, meta.data()) == 0;
-  return bssn_cuda_unpack_state_segments_from_device_buffer(
-             block, state_count, data, block->shape, segment_count, meta.data()) == 0;
+    return have_views
+               ? bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(
+                     block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
+               : bssn_cuda_pack_state_segments_to_device_buffer(
+                     block, state_count, data, block->shape, segment_count, meta.data()) == 0;
+  return have_views
+             ? bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(
+                   block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
+             : bssn_cuda_unpack_state_segments_from_device_buffer(
+                   block, state_count, data, block->shape, segment_count, meta.data()) == 0;
 }
 
 int cuda_data_packer_device_batched(double *data,
@@ -638,7 +712,8 @@ int cuda_data_packer_device_batched(double *data,
 
       if (batch_block && batch_block != block)
       {
-        if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir))
+        MyList<var> *batch_vars = (dir == PACK) ? VarLists : VarListd;
+        if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_vars))
           return -1;
         batch_meta.clear();
       }
@@ -672,7 +747,8 @@ int cuda_data_packer_device_batched(double *data,
 
   if (batch_block)
   {
-    if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir))
+    MyList<var> *batch_vars = (dir == PACK) ? VarLists : VarListd;
+    if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_vars))
       return -1;
   }
   return size_out;
@@ -4542,9 +4618,9 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
               cuda_can_direct_pack(src->data, dst->data, type))
           {
             if (s_cuda_aware_pack_active) {
-              handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type);
+              handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists);
             } else {
-              handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count);
+              handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count, VarLists);
             }
             if (!handled_by_cuda)
             {
@@ -4557,9 +4633,9 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
                    cuda_can_direct_unpack(dst->data, type))
           {
             if (s_cuda_aware_pack_active) {
-              handled_by_cuda = cuda_direct_unpack_segment_from_device(data + size_out, dst->data, state_count);
+              handled_by_cuda = cuda_direct_unpack_segment_from_device(data + size_out, dst->data, state_count, VarListd);
             } else {
-              handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_count);
+              handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_count, VarListd);
             }
             if (!handled_by_cuda)
             {
@@ -6476,6 +6552,26 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
     Block *cg = BP->data;
     if (myrank == cg->rank)
     {
+#if USE_CUDA_BSSN
+      double *src1_views[BSSN_CUDA_STATE_COUNT];
+      double *src2_views[BSSN_CUDA_STATE_COUNT];
+      double *dst_views[BSSN_CUDA_STATE_COUNT];
+      const int state_count = cuda_state_var_count(VarList1, VarList2);
+      if (state_count == BSSN_CUDA_STATE_COUNT &&
+          cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
+          cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
+          cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) &&
+          bssn_cuda_has_resident_state(cg) &&
+          bssn_cuda_prepare_inter_time_level(cg, cg->shape,
+                                             src1_views, src2_views, 0, dst_views,
+                                             2, tindex) == 0)
+      {
+        if (BP == Pat->ble)
+          break;
+        BP = BP->next;
+        continue;
+      }
+#endif
       varl1 = VarList1;
       varl2 = VarList2;
       varl3 = VarList3;
@@ -6531,6 +6627,28 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
     Block *cg = BP->data;
     if (myrank == cg->rank)
     {
+#if USE_CUDA_BSSN
+      double *src1_views[BSSN_CUDA_STATE_COUNT];
+      double *src2_views[BSSN_CUDA_STATE_COUNT];
+      double *src3_views[BSSN_CUDA_STATE_COUNT];
+      double *dst_views[BSSN_CUDA_STATE_COUNT];
+      const int state_count = cuda_state_var_count(VarList1, VarList2);
+      if (state_count == BSSN_CUDA_STATE_COUNT &&
+          cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
+          cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
+          cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) &&
+          cuda_build_bssn_host_views(cg, VarList4, state_count, dst_views) &&
+          bssn_cuda_has_resident_state(cg) &&
+          bssn_cuda_prepare_inter_time_level(cg, cg->shape,
+                                             src1_views, src2_views, src3_views, dst_views,
+                                             3, tindex) == 0)
+      {
+        if (BP == Pat->ble)
+          break;
+        BP = BP->next;
+        continue;
+      }
+#endif
       varl1 = VarList1;
       varl2 = VarList2;
       varl3 = VarList3;
diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index cdd56e3..96d1fef 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -273,6 +273,12 @@ bool bssn_cuda_use_resident_sync(int lev)
 
 bool bssn_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
 {
+  static int keep_all_levels = -1;
+  if (keep_all_levels < 0)
+  {
+    const char *env = getenv("AMSS_CUDA_KEEP_ALL_LEVELS");
+    keep_all_levels = (env && atoi(env) != 0) ? 1 : 0;
+  }
   static int enabled = -1;
   if (enabled < 0)
   {
@@ -281,11 +287,12 @@ bool bssn_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
   }
   if (!enabled)
     return false;
-  // Levels at and above trfls can be read by CPU time interpolation through
-  // State/Old/corrector lists. Keep those conservative until multi-time-level
-  // resident storage is implemented.
   if (lev == analysis_lev)
     return false;
+  if (keep_all_levels)
+    return true;
+  // Conservative default: high time-refinement levels still have scattered
+  // CPU consumers outside the RK4/AMR exchange path.
   return lev < trfls_in;
 }
 
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu
index 03b2b73..80e3918 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -437,6 +437,7 @@ static const int STAGE_SLOT_COUNT =
 static constexpr int BSSN_STATE_COUNT = 24;
 static constexpr int BSSN_MATTER_COUNT = 10;
 static constexpr int BSSN_LK_FIELD_COUNT = 24;
+static constexpr int BSSN_RESIDENT_BANK_COUNT = 4;
 
 static const int k_state_input_slots[BSSN_STATE_COUNT] = {
     S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
@@ -512,6 +513,7 @@ struct StepContext {
     double *d_accum_mem;
     double *d_state_curr_mem;
     double *d_state_next_mem;
+    std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
     double *d_matter_mem;
     double *d_comm_mem;
     double *h_comm_mem;
@@ -519,6 +521,10 @@ struct StepContext {
     std::array<double *, BSSN_STATE_COUNT> d_accum;
     std::array<double *, BSSN_STATE_COUNT> d_state_curr;
     std::array<double *, BSSN_STATE_COUNT> d_state_next;
+    std::array<std::array<double *, BSSN_STATE_COUNT>, BSSN_RESIDENT_BANK_COUNT> d_resident;
+    std::array<std::array<double *, BSSN_STATE_COUNT>, BSSN_RESIDENT_BANK_COUNT> resident_host;
+    std::array<unsigned long long, BSSN_RESIDENT_BANK_COUNT> resident_age;
+    std::array<bool, BSSN_RESIDENT_BANK_COUNT> resident_valid;
     std::array<double *, BSSN_MATTER_COUNT> d_matter;
     size_t cap_all;
     size_t cap_comm;
@@ -526,18 +532,29 @@ struct StepContext {
     size_t cap_h_comm;
     bool matter_ready;
     bool state_ready;
+    int current_bank;
+    unsigned long long resident_clock;
 
     StepContext()
         : d_state0_mem(nullptr), d_accum_mem(nullptr),
           d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
+          d_resident_mem{},
           d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr),
           cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0),
-          matter_ready(false), state_ready(false)
+          matter_ready(false), state_ready(false),
+          current_bank(-1), resident_clock(0)
     {
+        d_resident_mem.fill(nullptr);
         d_state0.fill(nullptr);
         d_accum.fill(nullptr);
         d_state_curr.fill(nullptr);
         d_state_next.fill(nullptr);
+        for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+            d_resident[b].fill(nullptr);
+            resident_host[b].fill(nullptr);
+        }
+        resident_age.fill(0);
+        resident_valid.fill(false);
         d_matter.fill(nullptr);
     }
 };
@@ -545,8 +562,7 @@ struct StepContext {
 struct StepAllocation {
     double *d_state0_mem;
     double *d_accum_mem;
-    double *d_state_curr_mem;
-    double *d_state_next_mem;
+    std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
     double *d_matter_mem;
     double *d_comm_mem;
     double *h_comm_mem;
@@ -563,11 +579,17 @@ static size_t g_comm_segment_meta_cap = 0;
 
 static StepAllocation empty_step_allocation()
 {
-    StepAllocation alloc = {
-        nullptr, nullptr, nullptr, nullptr, nullptr,
-        nullptr, nullptr,
-        0, 0, false, 0
-    };
+    StepAllocation alloc = {};
+    alloc.d_state0_mem = nullptr;
+    alloc.d_accum_mem = nullptr;
+    alloc.d_resident_mem.fill(nullptr);
+    alloc.d_matter_mem = nullptr;
+    alloc.d_comm_mem = nullptr;
+    alloc.h_comm_mem = nullptr;
+    alloc.cap_all = 0;
+    alloc.cap_comm = 0;
+    alloc.h_comm_pinned = false;
+    alloc.cap_h_comm = 0;
     return alloc;
 }
 
@@ -578,16 +600,22 @@ static bool has_step_allocation(const StepAllocation &alloc)
 
 static StepAllocation detach_step_allocation(StepContext &ctx)
 {
-    StepAllocation alloc = {
-        ctx.d_state0_mem, ctx.d_accum_mem, ctx.d_state_curr_mem,
-        ctx.d_state_next_mem, ctx.d_matter_mem,
-        ctx.d_comm_mem, ctx.h_comm_mem,
-        ctx.cap_all, ctx.cap_comm, ctx.h_comm_pinned, ctx.cap_h_comm
-    };
+    StepAllocation alloc = {};
+    alloc.d_state0_mem = ctx.d_state0_mem;
+    alloc.d_accum_mem = ctx.d_accum_mem;
+    alloc.d_resident_mem = ctx.d_resident_mem;
+    alloc.d_matter_mem = ctx.d_matter_mem;
+    alloc.d_comm_mem = ctx.d_comm_mem;
+    alloc.h_comm_mem = ctx.h_comm_mem;
+    alloc.cap_all = ctx.cap_all;
+    alloc.cap_comm = ctx.cap_comm;
+    alloc.h_comm_pinned = ctx.h_comm_pinned;
+    alloc.cap_h_comm = ctx.cap_h_comm;
     ctx.d_state0_mem = nullptr;
     ctx.d_accum_mem = nullptr;
     ctx.d_state_curr_mem = nullptr;
     ctx.d_state_next_mem = nullptr;
+    ctx.d_resident_mem.fill(nullptr);
     ctx.d_matter_mem = nullptr;
     ctx.d_comm_mem = nullptr;
     ctx.h_comm_mem = nullptr;
@@ -597,10 +625,18 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
     ctx.cap_h_comm = 0;
     ctx.matter_ready = false;
     ctx.state_ready = false;
+    ctx.current_bank = -1;
+    ctx.resident_clock = 0;
     ctx.d_state0.fill(nullptr);
     ctx.d_accum.fill(nullptr);
     ctx.d_state_curr.fill(nullptr);
     ctx.d_state_next.fill(nullptr);
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        ctx.d_resident[b].fill(nullptr);
+        ctx.resident_host[b].fill(nullptr);
+    }
+    ctx.resident_age.fill(0);
+    ctx.resident_valid.fill(false);
     ctx.d_matter.fill(nullptr);
     return alloc;
 }
@@ -609,8 +645,9 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
 {
     ctx.d_state0_mem = alloc.d_state0_mem;
     ctx.d_accum_mem = alloc.d_accum_mem;
-    ctx.d_state_curr_mem = alloc.d_state_curr_mem;
-    ctx.d_state_next_mem = alloc.d_state_next_mem;
+    ctx.d_resident_mem = alloc.d_resident_mem;
+    ctx.d_state_curr_mem = nullptr;
+    ctx.d_state_next_mem = nullptr;
     ctx.d_matter_mem = alloc.d_matter_mem;
     ctx.d_comm_mem = alloc.d_comm_mem;
     ctx.h_comm_mem = alloc.h_comm_mem;
@@ -620,6 +657,13 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
     ctx.cap_h_comm = alloc.cap_h_comm;
     ctx.matter_ready = false;
     ctx.state_ready = false;
+    ctx.current_bank = -1;
+    ctx.resident_clock = 0;
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        ctx.resident_host[b].fill(nullptr);
+    }
+    ctx.resident_age.fill(0);
+    ctx.resident_valid.fill(false);
 }
 
 static void recycle_step_allocation(StepAllocation &alloc)
@@ -708,8 +752,10 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
         if (!has_step_allocation(alloc)) {
             CUDA_CHECK(cudaMalloc(&alloc.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
             CUDA_CHECK(cudaMalloc(&alloc.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
-            CUDA_CHECK(cudaMalloc(&alloc.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double)));
-            CUDA_CHECK(cudaMalloc(&alloc.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double)));
+            for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+                CUDA_CHECK(cudaMalloc(&alloc.d_resident_mem[b],
+                                      BSSN_STATE_COUNT * all * sizeof(double)));
+            }
             CUDA_CHECK(cudaMalloc(&alloc.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
             alloc.cap_all = all;
         }
@@ -718,8 +764,13 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
     for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
         ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
         ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
-        ctx.d_state_curr[i] = ctx.d_state_curr_mem + (size_t)i * all;
-        ctx.d_state_next[i] = ctx.d_state_next_mem + (size_t)i * all;
+        for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+            ctx.d_resident[b][i] = ctx.d_resident_mem[b] + (size_t)i * all;
+        }
+    }
+    if (ctx.current_bank >= 0) {
+        ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank];
+        ctx.d_state_curr = ctx.d_resident[ctx.current_bank];
     }
     for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
         ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all;
@@ -4825,6 +4876,263 @@ static void bind_state_output_slots(const std::array<double *, BSSN_STATE_COUNT>
     }
 }
 
+static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key)
+{
+    if (!host_key || bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT)
+        return false;
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        if (!host_key[i] || ctx.resident_host[bank][i] != host_key[i])
+            return false;
+    }
+    return true;
+}
+
+static int find_resident_bank(const StepContext &ctx, double **host_key)
+{
+    if (!host_key) return -1;
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (resident_key_matches(ctx, b, host_key))
+            return b;
+    }
+    return -1;
+}
+
+static bool resident_key_usable(double **host_key)
+{
+    if (!host_key) return false;
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        if (!host_key[i]) return false;
+    }
+    return true;
+}
+
+static void mark_resident_current_bank(StepContext &ctx, int bank)
+{
+    if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return;
+    ctx.current_bank = bank;
+    ctx.d_state_curr_mem = ctx.d_resident_mem[bank];
+    ctx.d_state_curr = ctx.d_resident[bank];
+    ctx.state_ready = ctx.resident_valid[bank];
+}
+
+static void mark_resident_next_bank(StepContext &ctx, int bank)
+{
+    if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return;
+    ctx.d_state_next_mem = ctx.d_resident_mem[bank];
+    ctx.d_state_next = ctx.d_resident[bank];
+}
+
+static bool any_resident_bank_valid(const StepContext &ctx)
+{
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (ctx.resident_valid[b]) return true;
+    }
+    return false;
+}
+
+static void update_state_ready(StepContext &ctx)
+{
+    ctx.state_ready = any_resident_bank_valid(ctx);
+}
+
+static void writeback_resident_bank(StepContext &ctx, int bank, size_t all)
+{
+    if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return;
+    if (!ctx.resident_valid[bank]) return;
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        if (!ctx.resident_host[bank][i]) return;
+    }
+    const size_t bytes = all * sizeof(double);
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        CUDA_CHECK(cudaMemcpyAsync(ctx.resident_host[bank][i],
+                                   ctx.d_resident[bank][i],
+                                   bytes, cudaMemcpyDeviceToHost));
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+static int choose_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all)
+{
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (b != avoid_bank && !ctx.resident_valid[b])
+            return b;
+    }
+
+    int best = -1;
+    unsigned long long best_age = 0;
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (b == avoid_bank) continue;
+        if (best < 0 || ctx.resident_age[b] < best_age) {
+            best = b;
+            best_age = ctx.resident_age[b];
+        }
+    }
+    if (best < 0) best = 0;
+    writeback_resident_bank(ctx, best, all);
+    ctx.resident_valid[best] = false;
+    ctx.resident_host[best].fill(nullptr);
+    ctx.resident_age[best] = 0;
+    if (ctx.current_bank == best) {
+        ctx.current_bank = -1;
+        ctx.d_state_curr_mem = nullptr;
+        ctx.d_state_curr.fill(nullptr);
+    }
+    update_state_ready(ctx);
+    return best;
+}
+
+static void assign_resident_key(StepContext &ctx, int bank, double **host_key)
+{
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        ctx.resident_host[bank][i] = host_key[i];
+    }
+    ctx.resident_age[bank] = ++ctx.resident_clock;
+}
+
+static int ensure_resident_bank(StepContext &ctx,
+                                double **host_key,
+                                size_t all,
+                                bool upload_if_missing,
+                                int avoid_bank = -1)
+{
+    if (!resident_key_usable(host_key)) {
+        if (ctx.current_bank >= 0)
+            return ctx.current_bank;
+        return 0;
+    }
+
+    int bank = find_resident_bank(ctx, host_key);
+    if (bank >= 0) {
+        ctx.resident_age[bank] = ++ctx.resident_clock;
+        if (!ctx.resident_valid[bank] && upload_if_missing) {
+            bind_state_input_slots(ctx.d_resident[bank]);
+            upload_state_inputs(host_key, all);
+            ctx.resident_valid[bank] = true;
+        }
+        return bank;
+    }
+
+    bank = choose_resident_bank_for_reuse(ctx, avoid_bank, all);
+    assign_resident_key(ctx, bank, host_key);
+    if (upload_if_missing) {
+        bind_state_input_slots(ctx.d_resident[bank]);
+        upload_state_inputs(host_key, all);
+        ctx.resident_valid[bank] = true;
+    } else {
+        ctx.resident_valid[bank] = false;
+    }
+    update_state_ready(ctx);
+    return bank;
+}
+
+static int reserve_resident_output_bank(StepContext &ctx,
+                                        double **host_key,
+                                        size_t all,
+                                        int input_bank)
+{
+    if (!resident_key_usable(host_key))
+        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;
+    if (resident_key_matches(ctx, input_bank, host_key))
+        return input_bank;
+
+    int bank = find_resident_bank(ctx, host_key);
+    if (bank < 0)
+        bank = choose_resident_bank_for_reuse(ctx, input_bank, all);
+    assign_resident_key(ctx, bank, host_key);
+    ctx.resident_valid[bank] = false;
+    ctx.resident_age[bank] = ++ctx.resident_clock;
+    update_state_ready(ctx);
+    return bank;
+}
+
+static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c)
+{
+    return bank == avoid_a || bank == avoid_b || bank == avoid_c;
+}
+
+static int choose_resident_bank_for_reuse_avoiding(StepContext &ctx,
+                                                   int avoid_a,
+                                                   int avoid_b,
+                                                   int avoid_c,
+                                                   size_t all)
+{
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (!bank_is_avoided(b, avoid_a, avoid_b, avoid_c) && !ctx.resident_valid[b])
+            return b;
+    }
+
+    int best = -1;
+    unsigned long long best_age = 0;
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (bank_is_avoided(b, avoid_a, avoid_b, avoid_c)) continue;
+        if (best < 0 || ctx.resident_age[b] < best_age) {
+            best = b;
+            best_age = ctx.resident_age[b];
+        }
+    }
+    if (best < 0)
+        return choose_resident_bank_for_reuse(ctx, avoid_a, all);
+
+    writeback_resident_bank(ctx, best, all);
+    ctx.resident_valid[best] = false;
+    ctx.resident_host[best].fill(nullptr);
+    ctx.resident_age[best] = 0;
+    if (ctx.current_bank == best) {
+        ctx.current_bank = -1;
+        ctx.d_state_curr_mem = nullptr;
+        ctx.d_state_curr.fill(nullptr);
+    }
+    update_state_ready(ctx);
+    return best;
+}
+
+static int reserve_resident_output_bank_avoiding(StepContext &ctx,
+                                                 double **host_key,
+                                                 size_t all,
+                                                 int avoid_a,
+                                                 int avoid_b,
+                                                 int avoid_c)
+{
+    if (!resident_key_usable(host_key))
+        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;
+    if (resident_key_matches(ctx, avoid_a, host_key))
+        return avoid_a;
+    if (resident_key_matches(ctx, avoid_b, host_key))
+        return avoid_b;
+    if (resident_key_matches(ctx, avoid_c, host_key))
+        return avoid_c;
+
+    int bank = find_resident_bank(ctx, host_key);
+    if (bank < 0)
+        bank = choose_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
+    assign_resident_key(ctx, bank, host_key);
+    ctx.resident_valid[bank] = false;
+    ctx.resident_age[bank] = ++ctx.resident_clock;
+    update_state_ready(ctx);
+    return bank;
+}
+
+static int active_or_keyed_bank(StepContext &ctx,
+                                double **host_key,
+                                size_t all,
+                                bool upload_if_missing)
+{
+    if (resident_key_usable(host_key)) {
+        int bank = ensure_resident_bank(ctx, host_key, all, upload_if_missing);
+        mark_resident_current_bank(ctx, bank);
+        return bank;
+    }
+    if (ctx.current_bank >= 0)
+        return ctx.current_bank;
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (ctx.resident_valid[b]) {
+            mark_resident_current_bank(ctx, b);
+            return b;
+        }
+    }
+    return 0;
+}
+
 static void launch_rhs_pipeline(int all, double eps, int co)
 {
     const double SYM = 1.0;
@@ -5165,6 +5473,28 @@ static void download_constraint_outputs(double **constraint_host_out, size_t all
     }
 }
 
+__global__ void kern_prepare_inter_time_level(const double * __restrict__ src1,
+                                              const double * __restrict__ src2,
+                                              const double * __restrict__ src3,
+                                              double * __restrict__ dst,
+                                              double c1,
+                                              double c2,
+                                              double c3,
+                                              int state_count,
+                                              int all)
+{
+    const int state = blockIdx.y;
+    if (state >= state_count) return;
+    const size_t off = (size_t)state * all;
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+         i < all;
+         i += blockDim.x * gridDim.x)
+    {
+        const double v3 = src3 ? src3[off + i] : 0.0;
+        dst[off + i] = c1 * src1[off + i] + c2 * src2[off + i] + c3 * v3;
+    }
+}
+
 __global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
                                              double * __restrict__ dst,
                                              int nx, int ny,
@@ -5416,13 +5746,18 @@ static void copy_state_region_cuda(void *block_tag,
                                    const int *ex,
                                    int i0, int j0, int k0,
                                    int sx, int sy, int sz,
-                                   cudaMemcpyKind kind)
+                                   cudaMemcpyKind kind,
+                                   double **state_host_key = nullptr)
 {
     if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return;
     if (sx <= 0 || sy <= 0 || sz <= 0) return;
 
     const size_t pitch = (size_t)ex[0] * sizeof(double);
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
+                                          kind == cudaMemcpyHostToDevice);
+    double *base_mem = ctx.d_resident_mem[bank];
 
     cudaMemcpy3DParms p = {};
     p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz);
@@ -5430,13 +5765,19 @@ static void copy_state_region_cuda(void *block_tag,
     p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
 
     if (kind == cudaMemcpyDeviceToHost) {
-        p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]);
+        p.srcPtr = make_cudaPitchedPtr((void *)(base_mem + (size_t)state_index * all), pitch, ex[0], ex[1]);
         p.dstPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]);
     } else {
         p.srcPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]);
-        p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]);
+        p.dstPtr = make_cudaPitchedPtr((void *)(base_mem + (size_t)state_index * all), pitch, ex[0], ex[1]);
     }
     CUDA_CHECK(cudaMemcpy3D(&p));
+    if (kind == cudaMemcpyHostToDevice) {
+        ctx.resident_valid[bank] = true;
+        ctx.resident_age[bank] = ++ctx.resident_clock;
+        mark_resident_current_bank(ctx, bank);
+        update_state_ready(ctx);
+    }
 }
 
 static void copy_state_region_packed_cuda(void *block_tag,
@@ -5445,7 +5786,8 @@ static void copy_state_region_packed_cuda(void *block_tag,
                                           const int *ex,
                                           int i0, int j0, int k0,
                                           int sx, int sy, int sz,
-                                          cudaMemcpyKind kind)
+                                          cudaMemcpyKind kind,
+                                          double **state_host_key = nullptr)
 {
     if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return;
     if (sx <= 0 || sy <= 0 || sz <= 0) return;
@@ -5453,23 +5795,33 @@ static void copy_state_region_packed_cuda(void *block_tag,
     const size_t src_pitch = (size_t)ex[0] * sizeof(double);
     const size_t dst_pitch = (size_t)sx * sizeof(double);
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
+                                          kind == cudaMemcpyHostToDevice);
+    double *base_mem = ctx.d_resident_mem[bank];
 
     cudaMemcpy3DParms p = {};
     p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz);
 
     if (kind == cudaMemcpyDeviceToHost) {
-        p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]);
+        p.srcPtr = make_cudaPitchedPtr((void *)(base_mem + (size_t)state_index * all), src_pitch, ex[0], ex[1]);
         p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
         p.dstPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy);
         p.dstPos = make_cudaPos(0, 0, 0);
     } else {
         p.srcPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy);
         p.srcPos = make_cudaPos(0, 0, 0);
-        p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]);
+        p.dstPtr = make_cudaPitchedPtr((void *)(base_mem + (size_t)state_index * all), src_pitch, ex[0], ex[1]);
         p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
     }
 
     CUDA_CHECK(cudaMemcpy3D(&p));
+    if (kind == cudaMemcpyHostToDevice) {
+        ctx.resident_valid[bank] = true;
+        ctx.resident_age[bank] = ++ctx.resident_clock;
+        mark_resident_current_bank(ctx, bank);
+        update_state_ready(ctx);
+    }
 }
 
 static void copy_state_region_packed_batch_cuda(void *block_tag,
@@ -5478,12 +5830,17 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
                                                 const int *ex,
                                                 int i0, int j0, int k0,
                                                 int sx, int sy, int sz,
-                                                cudaMemcpyKind kind)
+                                                cudaMemcpyKind kind,
+                                                double **state_host_key = nullptr)
 {
     if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
     if (sx <= 0 || sy <= 0 || sz <= 0) return;
 
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
+                                          kind == cudaMemcpyHostToDevice);
+    double *base_mem = ctx.d_resident_mem[bank];
     const int region_all = sx * sy * sz;
     const size_t total_doubles = (size_t)state_count * (size_t)region_all;
     double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
@@ -5492,7 +5849,7 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
         dim3 launch_grid((unsigned int)grid((size_t)region_all),
                          (unsigned int)state_count);
         kern_pack_state_region_batch<<<launch_grid, BLK>>>(
-            ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
+            base_mem, d_comm, ex[0], ex[1],
             i0, j0, k0, sx, sy, sz, region_all, state_count,
             ex[0] * ex[1] * ex[2]);
         CUDA_CHECK(cudaMemcpy(host_buffer, d_comm,
@@ -5505,9 +5862,13 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
         dim3 launch_grid((unsigned int)grid((size_t)region_all),
                          (unsigned int)state_count);
         kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
-            ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
+            base_mem, d_comm, ex[0], ex[1],
             i0, j0, k0, sx, sy, sz, region_all, state_count,
             ex[0] * ex[1] * ex[2]);
+        ctx.resident_valid[bank] = true;
+        ctx.resident_age[bank] = ++ctx.resident_clock;
+        mark_resident_current_bank(ctx, bank);
+        update_state_ready(ctx);
     }
 }
 
@@ -5516,6 +5877,11 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
     const size_t bytes = all * sizeof(double);
     StepContext &ctx = ensure_step_ctx(block_tag, all);
+    int bank = find_resident_bank(ctx, state_host_out);
+    if (bank < 0) {
+        bank = (ctx.current_bank >= 0) ? ctx.current_bank : active_or_keyed_bank(ctx, nullptr, all, false);
+    }
+    mark_resident_current_bank(ctx, bank);
     const bool profile = cuda_profile_enabled();
     const double t0 = profile ? cuda_profile_now_ms() : 0.0;
     static int direct_download = -1;
@@ -5525,7 +5891,7 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
     }
     if (direct_download) {
         for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
-            CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_state_curr[i],
+            CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_resident[bank][i],
                                        bytes, cudaMemcpyDeviceToHost));
         }
         CUDA_CHECK(cudaDeviceSynchronize());
@@ -5537,7 +5903,7 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
         }
         return;
     }
-    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem,
+    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank],
                           (size_t)BSSN_STATE_COUNT * bytes,
                           cudaMemcpyDeviceToHost));
     for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
@@ -5562,6 +5928,10 @@ static void copy_state_subset(void *block_tag,
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
     const size_t bytes = all * sizeof(double);
     StepContext &ctx = ensure_step_ctx(block_tag, all);
+    double **full_key = (subset_count == BSSN_STATE_COUNT) ? state_host : nullptr;
+    const int bank = active_or_keyed_bank(ctx, full_key, all,
+                                          kind == cudaMemcpyHostToDevice);
+    double *base_mem = ctx.d_resident_mem[bank];
     int active_state_indices[BSSN_STATE_COUNT];
     double *active_state_host[BSSN_STATE_COUNT];
     int active_count = 0;
@@ -5586,7 +5956,7 @@ static void copy_state_subset(void *block_tag,
     if (kind == cudaMemcpyDeviceToHost) {
         dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count);
         kern_pack_state_subset<<<launch_grid, BLK>>>(
-            ctx.d_state_curr_mem, d_comm, active_count, (int)all);
+            base_mem, d_comm, active_count, (int)all);
         CUDA_CHECK(cudaMemcpy(h_comm, d_comm,
                               total_doubles * sizeof(double),
                               cudaMemcpyDeviceToHost));
@@ -5606,14 +5976,18 @@ static void copy_state_subset(void *block_tag,
                               cudaMemcpyHostToDevice));
         dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count);
         kern_unpack_state_subset<<<launch_grid, BLK>>>(
-            ctx.d_state_curr_mem, d_comm, active_count, (int)all);
+            base_mem, d_comm, active_count, (int)all);
+        ctx.resident_valid[bank] = true;
+        ctx.resident_age[bank] = ++ctx.resident_clock;
+        mark_resident_current_bank(ctx, bank);
+        update_state_ready(ctx);
     }
 }
 
 static bool has_resident_state(void *block_tag)
 {
     auto it = g_step_ctx.find(block_tag);
-    return it != g_step_ctx.end() && it->second.state_ready;
+    return it != g_step_ctx.end() && any_resident_bank_valid(it->second);
 }
 
 /* ================================================================== */
@@ -6001,7 +6375,6 @@ int bssn_cuda_rk4_substep(void *block_tag,
                           double &chitiny)
 {
     (void)T;
-    (void)state_host_out;
 
     if (RK4 < 0 || RK4 > 3) return 1;
 
@@ -6031,12 +6404,18 @@ int bssn_cuda_rk4_substep(void *block_tag,
     }
     StepContext &ctx = ensure_step_ctx(block_tag, all);
     const bool use_resident_state = (keep_resident_state != 0);
+    int input_bank = -1;
+    int output_bank = -1;
     if (use_resident_state) {
-        bind_state_input_slots(ctx.d_state_curr);
-        bind_state_output_slots(ctx.d_state_next);
+        input_bank = ensure_resident_bank(ctx, state_host_in, all, true);
+        output_bank = reserve_resident_output_bank(ctx, state_host_out, all, input_bank);
+        mark_resident_current_bank(ctx, input_bank);
+        mark_resident_next_bank(ctx, output_bank);
+        bind_state_input_slots(ctx.d_resident[input_bank]);
+        bind_state_output_slots(ctx.d_resident[output_bank]);
     }
     double t0 = profile ? cuda_profile_now_ms() : 0.0;
-    if (!use_resident_state || !ctx.state_ready) {
+    if (!use_resident_state) {
         upload_state_inputs(state_host_in, all);
     }
     if (apply_enforce_ga) {
@@ -6057,7 +6436,10 @@ int bssn_cuda_rk4_substep(void *block_tag,
         } else {
             upload_matter_cache(ctx, matter_host, all);
         }
-        CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
+        const double *state0_src = use_resident_state
+                                 ? ctx.d_resident_mem[input_bank]
+                                 : g_buf.slot[S_chi];
+        CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, state0_src,
                               (size_t)BSSN_STATE_COUNT * bytes,
                               cudaMemcpyDeviceToDevice));
     } else if (!ctx.matter_ready) {
@@ -6109,9 +6491,10 @@ int bssn_cuda_rk4_substep(void *block_tag,
 
     t0 = profile ? cuda_profile_now_ms() : 0.0;
     if (use_resident_state) {
-        std::swap(ctx.d_state_curr_mem, ctx.d_state_next_mem);
-        ctx.d_state_curr.swap(ctx.d_state_next);
-        ctx.state_ready = true;
+        ctx.resident_valid[output_bank] = true;
+        ctx.resident_age[output_bank] = ++ctx.resident_clock;
+        mark_resident_current_bank(ctx, output_bank);
+        update_state_ready(ctx);
     } else {
         download_state_outputs(state_host_out, all);
     }
@@ -6233,6 +6616,24 @@ int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
     return 0;
 }
 
+extern "C"
+int bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
+                                                             double **state_host_key,
+                                                             int state_count,
+                                                             double *host_buffer,
+                                                             int *ex,
+                                                             int i0, int j0, int k0,
+                                                             int sx, int sy, int sz)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
+                                        i0, j0, k0, sx, sy, sz,
+                                        cudaMemcpyDeviceToHost,
+                                        state_host_key);
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                   int state_count,
@@ -6249,34 +6650,61 @@ int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
     return 0;
 }
 
+extern "C"
+int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
+                                                                 double **state_host_key,
+                                                                 int state_count,
+                                                                 double *host_buffer,
+                                                                 int *ex,
+                                                                 int i0, int j0, int k0,
+                                                                 int sx, int sy, int sz)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
+                                        i0, j0, k0, sx, sy, sz,
+                                        cudaMemcpyHostToDevice,
+                                        state_host_key);
+    return 0;
+}
+
 static void copy_state_device_batch(void *block_tag,
                                     int state_count,
                                     double *device_buffer,
                                     const int *ex,
                                     int i0, int j0, int k0,
                                     int sx, int sy, int sz,
-                                    int pack_not_unpack)
+                                    int pack_not_unpack,
+                                    double **state_host_key = nullptr)
 {
     if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
     if (sx <= 0 || sy <= 0 || sz <= 0) return;
 
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
+                                          pack_not_unpack == 0);
+    double *base_mem = ctx.d_resident_mem[bank];
     const int region_all = sx * sy * sz;
     dim3 launch_grid((unsigned int)grid((size_t)region_all),
                      (unsigned int)state_count);
 
     if (pack_not_unpack) {
         kern_pack_state_region_batch<<<launch_grid, BLK>>>(
-            ctx.d_state_curr_mem, device_buffer,
+            base_mem, device_buffer,
             ex[0], ex[1], i0, j0, k0, sx, sy, sz,
             region_all, state_count,
             ex[0] * ex[1] * ex[2]);
     } else {
         kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
-            ctx.d_state_curr_mem, device_buffer,
+            base_mem, device_buffer,
             ex[0], ex[1], i0, j0, k0, sx, sy, sz,
             region_all, state_count,
             ex[0] * ex[1] * ex[2]);
+        ctx.resident_valid[bank] = true;
+        ctx.resident_age[bank] = ++ctx.resident_clock;
+        mark_resident_current_bank(ctx, bank);
+        update_state_ready(ctx);
     }
 }
 
@@ -6286,7 +6714,8 @@ static void copy_state_device_segments(void *block_tag,
                                        const int *ex,
                                        int segment_count,
                                        const int *segment_meta,
-                                       int pack_not_unpack)
+                                       int pack_not_unpack,
+                                       double **state_host_key = nullptr)
 {
     if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
     if (segment_count <= 0 || !segment_meta) return;
@@ -6300,6 +6729,10 @@ static void copy_state_device_segments(void *block_tag,
     if (max_region_all <= 0) return;
 
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
+                                          pack_not_unpack == 0);
+    double *base_mem = ctx.d_resident_mem[bank];
     int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8);
     CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
                           (size_t)segment_count * 8 * sizeof(int),
@@ -6310,14 +6743,18 @@ static void copy_state_device_segments(void *block_tag,
                      (unsigned int)segment_count);
     if (pack_not_unpack) {
         kern_pack_state_segments_batch<<<launch_grid, BLK>>>(
-            ctx.d_state_curr_mem, device_buffer,
+            base_mem, device_buffer,
             ex[0], ex[1], d_meta, state_count,
             ex[0] * ex[1] * ex[2]);
     } else {
         kern_unpack_state_segments_batch<<<launch_grid, BLK>>>(
-            ctx.d_state_curr_mem, device_buffer,
+            base_mem, device_buffer,
             ex[0], ex[1], d_meta, state_count,
             ex[0] * ex[1] * ex[2]);
+        ctx.resident_valid[bank] = true;
+        ctx.resident_age[bank] = ++ctx.resident_clock;
+        mark_resident_current_bank(ctx, bank);
+        update_state_ready(ctx);
     }
 }
 
@@ -6336,6 +6773,22 @@ int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag,
     return 0;
 }
 
+extern "C"
+int bssn_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
+                                                               double **state_host_key,
+                                                               int state_count,
+                                                               double *device_buffer,
+                                                               int *ex,
+                                                               int i0, int j0, int k0,
+                                                               int sx, int sy, int sz)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    copy_state_device_batch(block_tag, state_count, device_buffer, ex,
+                            i0, j0, k0, sx, sy, sz, 1, state_host_key);
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                      int state_count,
@@ -6351,6 +6804,22 @@ int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
     return 0;
 }
 
+extern "C"
+int bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
+                                                                   double **state_host_key,
+                                                                   int state_count,
+                                                                   double *device_buffer,
+                                                                   int *ex,
+                                                                   int i0, int j0, int k0,
+                                                                   int sx, int sy, int sz)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    copy_state_device_batch(block_tag, state_count, device_buffer, ex,
+                            i0, j0, k0, sx, sy, sz, 0, state_host_key);
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag,
                                                    int state_count,
@@ -6366,6 +6835,22 @@ int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag,
     return 0;
 }
 
+extern "C"
+int bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
+                                                                  double **state_host_key,
+                                                                  int state_count,
+                                                                  double *device_buffer,
+                                                                  int *ex,
+                                                                  int segment_count,
+                                                                  const int *segment_meta)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    copy_state_device_segments(block_tag, state_count, device_buffer, ex,
+                               segment_count, segment_meta, 1, state_host_key);
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
                                                        int state_count,
@@ -6381,6 +6866,22 @@ int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
     return 0;
 }
 
+extern "C"
+int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
+                                                                      double **state_host_key,
+                                                                      int state_count,
+                                                                      double *device_buffer,
+                                                                      int *ex,
+                                                                      int segment_count,
+                                                                      const int *segment_meta)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    copy_state_device_segments(block_tag, state_count, device_buffer, ex,
+                               segment_count, segment_meta, 0, state_host_key);
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                     int state_count,
@@ -6405,6 +6906,33 @@ int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
     return 0;
 }
 
+extern "C"
+int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
+                                                                   double **state_host_key,
+                                                                   int state_count,
+                                                                   double *device_buffer,
+                                                                   int *ex,
+                                                                   int sx, int sy, int sz,
+                                                                   int fi0, int fj0, int fk0)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
+    if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const int bank = active_or_keyed_bank(ctx, state_host_key, all, true);
+    const int region_all = sx * sy * sz;
+    dim3 launch_grid((unsigned int)grid((size_t)region_all),
+                     (unsigned int)state_count);
+    kern_restrict_state_region_batch<<<launch_grid, BLK>>>(
+        ctx.d_resident_mem[bank], device_buffer,
+        ex[0], ex[1], sx, sy, sz,
+        fi0, fj0, fk0, region_all, state_count,
+        ex[0] * ex[1] * ex[2]);
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                    int state_count,
@@ -6431,6 +6959,35 @@ int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
     return 0;
 }
 
+extern "C"
+int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
+                                                                  double **state_host_key,
+                                                                  int state_count,
+                                                                  double *device_buffer,
+                                                                  int *ex,
+                                                                  int sx, int sy, int sz,
+                                                                  int ii0, int jj0, int kk0,
+                                                                  int lbc_i, int lbc_j, int lbc_k)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
+    if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const int bank = active_or_keyed_bank(ctx, state_host_key, all, true);
+    const int region_all = sx * sy * sz;
+    dim3 launch_grid((unsigned int)grid((size_t)region_all),
+                     (unsigned int)state_count);
+    kern_prolong_state_region_batch<<<launch_grid, BLK>>>(
+        ctx.d_resident_mem[bank], device_buffer,
+        ex[0], ex[1], sx, sy, sz,
+        ii0, jj0, kk0, lbc_i, lbc_j, lbc_k,
+        region_all, state_count,
+        ex[0] * ex[1] * ex[2]);
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_download_state_subset(void *block_tag,
                                     int *ex,
@@ -6459,6 +7016,71 @@ int bssn_cuda_upload_state_subset(void *block_tag,
     return 0;
 }
 
+extern "C"
+int bssn_cuda_prepare_inter_time_level(void *block_tag,
+                                       int *ex,
+                                       double **src1_host_key,
+                                       double **src2_host_key,
+                                       double **src3_host_key,
+                                       double **dst_host_key,
+                                       int source_count,
+                                       int tindex)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    if (source_count != 2 && source_count != 3) return 1;
+    if (!resident_key_usable(src1_host_key) ||
+        !resident_key_usable(src2_host_key) ||
+        !resident_key_usable(dst_host_key))
+        return 1;
+    if (source_count == 3 && !resident_key_usable(src3_host_key))
+        return 1;
+
+    double c1 = 0.0, c2 = 0.0, c3 = 0.0;
+    if (source_count == 2) {
+        if (tindex == 0) {
+            c1 = 0.5; c2 = 0.5;
+        } else if (tindex == 1) {
+            c1 = 0.75; c2 = 0.25;
+        } else if (tindex == -1) {
+            c1 = 0.25; c2 = 0.75;
+        } else {
+            return 1;
+        }
+    } else {
+        if (tindex == 0) {
+            c1 = 3.0 / 8.0; c2 = 3.0 / 4.0; c3 = -1.0 / 8.0;
+        } else if (tindex == 1 || tindex == -1) {
+            c1 = 5.0 / 32.0; c2 = 15.0 / 16.0; c3 = -3.0 / 32.0;
+        } else {
+            return 1;
+        }
+    }
+
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    StepContext &ctx = ensure_step_ctx(block_tag, all);
+    const int src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true);
+    const int src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank);
+    const int src3_bank = (source_count == 3)
+                        ? ensure_resident_bank(ctx, src3_host_key, all, true, src1_bank)
+                        : -1;
+    const int dst_bank = reserve_resident_output_bank_avoiding(ctx, dst_host_key, all,
+                                                               src1_bank, src2_bank, src3_bank);
+
+    dim3 launch_grid((unsigned int)grid(all), (unsigned int)BSSN_STATE_COUNT);
+    kern_prepare_inter_time_level<<<launch_grid, BLK>>>(
+        ctx.d_resident_mem[src1_bank],
+        ctx.d_resident_mem[src2_bank],
+        (source_count == 3) ? ctx.d_resident_mem[src3_bank] : nullptr,
+        ctx.d_resident_mem[dst_bank],
+        c1, c2, c3, BSSN_STATE_COUNT, (int)all);
+    ctx.resident_valid[dst_bank] = true;
+    ctx.resident_age[dst_bank] = ++ctx.resident_clock;
+    mark_resident_current_bank(ctx, dst_bank);
+    update_state_ready(ctx);
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_has_resident_state(void *block_tag)
 {
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h
index 5df93a9..fda21d6 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -97,6 +97,14 @@ int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz);
 
+int bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
+                                                             double **state_host_key,
+                                                             int state_count,
+                                                             double *host_buffer,
+                                                             int *ex,
+                                                             int i0, int j0, int k0,
+                                                             int sx, int sy, int sz);
+
 int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                   int state_count,
                                                   double *host_buffer,
@@ -104,6 +112,14 @@ int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                   int i0, int j0, int k0,
                                                   int sx, int sy, int sz);
 
+int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
+                                                                 double **state_host_key,
+                                                                 int state_count,
+                                                                 double *host_buffer,
+                                                                 int *ex,
+                                                                 int i0, int j0, int k0,
+                                                                 int sx, int sy, int sz);
+
 int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                                 int state_count,
                                                 double *device_buffer,
@@ -111,6 +127,14 @@ int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                                 int i0, int j0, int k0,
                                                 int sx, int sy, int sz);
 
+int bssn_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
+                                                               double **state_host_key,
+                                                               int state_count,
+                                                               double *device_buffer,
+                                                               int *ex,
+                                                               int i0, int j0, int k0,
+                                                               int sx, int sy, int sz);
+
 int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                     int state_count,
                                                     double *device_buffer,
@@ -118,6 +142,14 @@ int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                     int i0, int j0, int k0,
                                                     int sx, int sy, int sz);
 
+int bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
+                                                                   double **state_host_key,
+                                                                   int state_count,
+                                                                   double *device_buffer,
+                                                                   int *ex,
+                                                                   int i0, int j0, int k0,
+                                                                   int sx, int sy, int sz);
+
 int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag,
                                                    int state_count,
                                                    double *device_buffer,
@@ -125,6 +157,14 @@ int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag,
                                                    int segment_count,
                                                    const int *segment_meta);
 
+int bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
+                                                                  double **state_host_key,
+                                                                  int state_count,
+                                                                  double *device_buffer,
+                                                                  int *ex,
+                                                                  int segment_count,
+                                                                  const int *segment_meta);
+
 int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
                                                        int state_count,
                                                        double *device_buffer,
@@ -132,6 +172,14 @@ int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
                                                        int segment_count,
                                                        const int *segment_meta);
 
+int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
+                                                                      double **state_host_key,
+                                                                      int state_count,
+                                                                      double *device_buffer,
+                                                                      int *ex,
+                                                                      int segment_count,
+                                                                      const int *segment_meta);
+
 int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                     int state_count,
                                                     double *device_buffer,
@@ -139,6 +187,14 @@ int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                     int sx, int sy, int sz,
                                                     int fi0, int fj0, int fk0);
 
+int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
+                                                                   double **state_host_key,
+                                                                   int state_count,
+                                                                   double *device_buffer,
+                                                                   int *ex,
+                                                                   int sx, int sy, int sz,
+                                                                   int fi0, int fj0, int fk0);
+
 int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                    int state_count,
                                                    double *device_buffer,
@@ -147,6 +203,15 @@ int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                    int ii0, int jj0, int kk0,
                                                    int lbc_i, int lbc_j, int lbc_k);
 
+int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
+                                                                  double **state_host_key,
+                                                                  int state_count,
+                                                                  double *device_buffer,
+                                                                  int *ex,
+                                                                  int sx, int sy, int sz,
+                                                                  int ii0, int jj0, int kk0,
+                                                                  int lbc_i, int lbc_j, int lbc_k);
+
 int bssn_cuda_download_state_subset(void *block_tag,
                                     int *ex,
                                     int subset_count,
@@ -159,6 +224,15 @@ int bssn_cuda_upload_state_subset(void *block_tag,
                                   const int *state_indices,
                                   double **state_host_in);
 
+int bssn_cuda_prepare_inter_time_level(void *block_tag,
+                                       int *ex,
+                                       double **src1_host_key,
+                                       double **src2_host_key,
+                                       double **src3_host_key,
+                                       double **dst_host_key,
+                                       int source_count,
+                                       int tindex);
+
 int bssn_cuda_has_resident_state(void *block_tag);
 
 void bssn_cuda_release_step_ctx(void *block_tag);