diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu
index 15b5ba5..04f2dd0 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -248,9 +248,12 @@ static const int k_matter_slots[BSSN_MATTER_COUNT] = {
 struct StepContext {
     double *d_state0_mem;
     double *d_accum_mem;
+    double *d_matter_mem;
     std::array<double *, BSSN_STATE_COUNT> d_state0;
     std::array<double *, BSSN_STATE_COUNT> d_accum;
+    std::array<double *, BSSN_MATTER_COUNT> d_matter;
     size_t cap_all;
+    bool matter_ready;
 };
 
 static std::unordered_map<void *, StepContext> g_step_ctx;
@@ -318,14 +321,23 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
             cudaFree(ctx.d_accum_mem);
             ctx.d_accum_mem = nullptr;
         }
+        if (ctx.d_matter_mem) {
+            cudaFree(ctx.d_matter_mem);
+            ctx.d_matter_mem = nullptr;
+        }
         CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
         CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
+        CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
         ctx.cap_all = all;
+        ctx.matter_ready = false;
     }
     for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
         ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
         ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
     }
+    for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
+        ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all;
+    }
     return ctx;
 }
 
@@ -335,6 +347,7 @@ static void release_step_ctx(void *block_tag)
     if (it == g_step_ctx.end()) return;
     if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
     if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
+    if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem);
     g_step_ctx.erase(it);
 }
 
@@ -2384,25 +2397,38 @@ static void setup_grid_params(int *ex,
     CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
 }
 
-static void upload_state_and_matter(double **state_host,
-                                    double **matter_host,
-                                    size_t all)
+static void upload_state_inputs(double **state_host, size_t all)
 {
-    static_assert(BSSN_STATE_COUNT + BSSN_MATTER_COUNT == H2D_INPUT_SLOT_COUNT,
-                  "state + matter upload must match contiguous input slots");
     const size_t bytes = all * sizeof(double);
     for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
         std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
     }
-    for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
-        std::memcpy(g_buf.h_stage + (size_t)(BSSN_STATE_COUNT + i) * all,
-                    matter_host[i], bytes);
-    }
     CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
-                          (size_t)H2D_INPUT_SLOT_COUNT * bytes,
+                          (size_t)BSSN_STATE_COUNT * bytes,
                           cudaMemcpyHostToDevice));
 }
 
+static void upload_matter_cache(StepContext &ctx,
+                                double **matter_host,
+                                size_t all)
+{
+    const size_t bytes = all * sizeof(double);
+    for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
+        std::memcpy(g_buf.h_stage + (size_t)i * all, matter_host[i], bytes);
+    }
+    CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage,
+                          (size_t)BSSN_MATTER_COUNT * bytes,
+                          cudaMemcpyHostToDevice));
+    ctx.matter_ready = true;
+}
+
+static void bind_matter_slots(const StepContext &ctx)
+{
+    for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
+        g_buf.slot[k_matter_slots[i]] = ctx.d_matter[i];
+    }
+}
+
 static void launch_rhs_pipeline(int all, double eps, int co)
 {
     const double SYM = 1.0;
@@ -3254,14 +3280,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
     const size_t bytes = all * sizeof(double);
 
     setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
-    upload_state_and_matter(state_host_in, matter_host, all);
-
     StepContext &ctx = ensure_step_ctx(block_tag, all);
+    upload_state_inputs(state_host_in, all);
     if (RK4 == 0) {
+        upload_matter_cache(ctx, matter_host, all);
         CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
                               (size_t)BSSN_STATE_COUNT * bytes,
                               cudaMemcpyDeviceToDevice));
+    } else if (!ctx.matter_ready) {
+        upload_matter_cache(ctx, matter_host, all);
     }
+    bind_matter_slots(ctx);
 
     launch_rhs_pipeline((int)all, eps, co);
 
@@ -3286,6 +3315,9 @@ int bssn_cuda_rk4_substep(void *block_tag,
     }
 
     download_state_outputs(state_host_out, all);
+    if (RK4 == 3) {
+        release_step_ctx(block_tag);
+    }
     return 0;
 }