Cache matter fields in StepContext across RK4 substeps

2026-04-12 22:19:45 +08:00
parent 4fa12a2009
commit 7f2a391dd2
1 changed files with 44 additions and 12 deletions
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -248,9 +248,12 @@ static const int k_matter_slots[BSSN_MATTER_COUNT] = {
 // Device-side scratch state stored per entry of g_step_ctx (a map keyed by a
 // void* block tag). It lets data uploaded on one RK4 substep be reused by the
 // following substeps of the same block.
 struct StepContext {
    // Backing device allocations. ensure_step_ctx (re)allocates each of these
    // with cudaMalloc as COUNT * all doubles and frees them with cudaFree.
    double *d_state0_mem;   // BSSN_STATE_COUNT * all doubles; receives a D2D copy of the state slots when RK4 == 0
    double *d_accum_mem;    // BSSN_STATE_COUNT * all doubles; NOTE(review): use is not visible in this excerpt
    double *d_matter_mem;   // BSSN_MATTER_COUNT * all doubles; filled by upload_matter_cache
    // Per-field views into the allocations above: entry i points at
    // base + i * all (set up on every ensure_step_ctx call).
    std::array<double *, BSSN_STATE_COUNT> d_state0;
    std::array<double *, BSSN_STATE_COUNT> d_accum;
    std::array<double *, BSSN_MATTER_COUNT> d_matter;
    // Value of `all` recorded at the last (re)allocation.
    size_t cap_all;
    // Set to true once upload_matter_cache has copied the matter fields into
    // d_matter_mem; reset to false whenever the buffers are reallocated.
    bool matter_ready;
 };
 static std::unordered_map<void *, StepContext> g_step_ctx;
@@ -318,14 +321,23 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
            cudaFree(ctx.d_accum_mem);
            ctx.d_accum_mem = nullptr;
        }
        if (ctx.d_matter_mem) {
            cudaFree(ctx.d_matter_mem);
            ctx.d_matter_mem = nullptr;
        }
        CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
        CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
        CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
        ctx.cap_all = all;
        ctx.matter_ready = false;
    }
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
        ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
        ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
    }
    for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
        ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all;
    }
    return ctx;
 }
@@ -335,6 +347,7 @@ static void release_step_ctx(void *block_tag)
    if (it == g_step_ctx.end()) return;
    if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
    if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
    if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem);
    g_step_ctx.erase(it);
 }
@@ -2384,25 +2397,38 @@ static void setup_grid_params(int *ex,
    CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
 }
-static void upload_state_and_matter(double **state_host,
+static void upload_state_inputs(double **state_host, size_t all)
                                    double **matter_host,
                                    size_t all)
 {
    static_assert(BSSN_STATE_COUNT + BSSN_MATTER_COUNT == H2D_INPUT_SLOT_COUNT,
                  "state + matter upload must match contiguous input slots");
    const size_t bytes = all * sizeof(double);
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
        std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
    }
    for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
        std::memcpy(g_buf.h_stage + (size_t)(BSSN_STATE_COUNT + i) * all,
                    matter_host[i], bytes);
    }
    CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
-                          (size_t)H2D_INPUT_SLOT_COUNT * bytes,
+                          (size_t)BSSN_STATE_COUNT * bytes,
                          cudaMemcpyHostToDevice));
 }
 // Copy all BSSN_MATTER_COUNT host matter fields into the context's device
 // matter buffer and mark the cache as populated.
 //
 //   ctx          step context whose d_matter_mem receives the data
 //   matter_host  array of BSSN_MATTER_COUNT host pointers, each to `all` doubles
 //   all          number of doubles per field
 //
 // The fields are first packed back-to-back at the start of g_buf.h_stage so
 // that a single host-to-device cudaMemcpy moves them all at once.
 static void upload_matter_cache(StepContext &ctx,
                                 double **matter_host,
                                 size_t all)
 {
    const size_t field_bytes = all * sizeof(double);

    // Pack field k at element offset k * all within the staging buffer.
    size_t stage_offset = 0;
    for (int field = 0; field < BSSN_MATTER_COUNT; ++field) {
        std::memcpy(g_buf.h_stage + stage_offset, matter_host[field], field_bytes);
        stage_offset += all;
    }

    // One contiguous transfer covering every staged matter field.
    const size_t total_bytes = (size_t)BSSN_MATTER_COUNT * field_bytes;
    CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem,
                          g_buf.h_stage,
                          total_bytes,
                          cudaMemcpyHostToDevice));

    // Later substeps check this flag to skip the upload.
    ctx.matter_ready = true;
 }
 // Point every matter entry of the global slot table at the corresponding
 // cached device array held by `ctx`. k_matter_slots[i] gives the slot index
 // for matter field i. No data is copied; only pointers are reassigned.
 static void bind_matter_slots(const StepContext &ctx)
 {
    int field = 0;
    while (field < BSSN_MATTER_COUNT) {
        const int slot_index = k_matter_slots[field];
        g_buf.slot[slot_index] = ctx.d_matter[field];
        ++field;
    }
 }
 static void launch_rhs_pipeline(int all, double eps, int co)
 {
    const double SYM = 1.0;
@@ -3254,14 +3280,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
    const size_t bytes = all * sizeof(double);
    setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
    upload_state_and_matter(state_host_in, matter_host, all);
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    upload_state_inputs(state_host_in, all);
    if (RK4 == 0) {
        upload_matter_cache(ctx, matter_host, all);
        CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
                              (size_t)BSSN_STATE_COUNT * bytes,
                              cudaMemcpyDeviceToDevice));
    } else if (!ctx.matter_ready) {
        upload_matter_cache(ctx, matter_host, all);
    }
    bind_matter_slots(ctx);
    launch_rhs_pipeline((int)all, eps, co);
@@ -3286,6 +3315,9 @@ int bssn_cuda_rk4_substep(void *block_tag,
    }
    download_state_outputs(state_host_out, all);
    if (RK4 == 3) {
        release_step_ctx(block_tag);
    }
    return 0;
 }