Stabilize EScalar CUDA sync defaults

2026-05-03 00:24:50 +08:00
parent 74ba5feb86
commit 4430d04ee7
6 changed files with 243 additions and 20 deletions
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -203,7 +203,16 @@ static bool escalar_host_pin_enabled() {
    static int enabled = -1;
    if (enabled < 0) {
        const char *env = getenv("AMSS_CUDA_PIN_ESCALAR_TRANSFERS");
-        enabled = (!env || atoi(env) != 0) ? 1 : 0;
+        enabled = (env && atoi(env) != 0) ? 1 : 0;
+    }
+    return enabled != 0;
+}
+
+static bool escalar_gpu_rk_enabled() {  // Returns true when env var AMSS_ESCALAR_GPU_RK parses via atoi to a non-zero integer.
+    static int enabled = -1;  // -1 means not read yet; the parsed result is cached after the first call
+    if (enabled < 0) {
+        const char *env = getenv("AMSS_ESCALAR_GPU_RK");
+        enabled = (env && atoi(env) != 0) ? 1 : 0;  // opt-in: an unset variable or a value atoi reads as 0 disables the feature
    }
    return enabled != 0;
 }
@@ -588,6 +597,8 @@ static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
 struct StepContext {
    double *d_state0_mem;
    double *d_accum_mem;
+    double *d_escalar0_mem;
+    double *d_escalar_accum_mem;
    double *d_state_curr_mem;
    double *d_state_next_mem;
    std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
@@ -596,6 +607,8 @@ struct StepContext {
    double *h_comm_mem;
    std::array<double *, BSSN_STATE_COUNT> d_state0;
    std::array<double *, BSSN_STATE_COUNT> d_accum;
+    std::array<double *, 2> d_escalar0;
+    std::array<double *, 2> d_escalar_accum;
    std::array<double *, BSSN_STATE_COUNT> d_state_curr;
    std::array<double *, BSSN_STATE_COUNT> d_state_next;
    std::array<std::array<double *, BSSN_STATE_COUNT>, BSSN_RESIDENT_BANK_COUNT> d_resident;
@@ -615,6 +628,7 @@ struct StepContext {

    StepContext()
        : d_state0_mem(nullptr), d_accum_mem(nullptr),
+          d_escalar0_mem(nullptr), d_escalar_accum_mem(nullptr),
          d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
          d_resident_mem{},
          d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr),
@@ -625,6 +639,8 @@ struct StepContext {
        d_resident_mem.fill(nullptr);
        d_state0.fill(nullptr);
        d_accum.fill(nullptr);
+        d_escalar0.fill(nullptr);
+        d_escalar_accum.fill(nullptr);
        d_state_curr.fill(nullptr);
        d_state_next.fill(nullptr);
        for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
@@ -641,6 +657,8 @@ struct StepContext {
 struct StepAllocation {
    double *d_state0_mem;
    double *d_accum_mem;
+    double *d_escalar0_mem;
+    double *d_escalar_accum_mem;
    std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
    double *d_matter_mem;
    double *d_comm_mem;
@@ -661,6 +679,8 @@ static StepAllocation empty_step_allocation()
    StepAllocation alloc = {};
    alloc.d_state0_mem = nullptr;
    alloc.d_accum_mem = nullptr;
+    alloc.d_escalar0_mem = nullptr;
+    alloc.d_escalar_accum_mem = nullptr;
    alloc.d_resident_mem.fill(nullptr);
    alloc.d_matter_mem = nullptr;
    alloc.d_comm_mem = nullptr;
@@ -682,6 +702,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
    StepAllocation alloc = {};
    alloc.d_state0_mem = ctx.d_state0_mem;
    alloc.d_accum_mem = ctx.d_accum_mem;
+    alloc.d_escalar0_mem = ctx.d_escalar0_mem;
+    alloc.d_escalar_accum_mem = ctx.d_escalar_accum_mem;
    alloc.d_resident_mem = ctx.d_resident_mem;
    alloc.d_matter_mem = ctx.d_matter_mem;
    alloc.d_comm_mem = ctx.d_comm_mem;
@@ -692,6 +714,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
    alloc.cap_h_comm = ctx.cap_h_comm;
    ctx.d_state0_mem = nullptr;
    ctx.d_accum_mem = nullptr;
+    ctx.d_escalar0_mem = nullptr;
+    ctx.d_escalar_accum_mem = nullptr;
    ctx.d_state_curr_mem = nullptr;
    ctx.d_state_next_mem = nullptr;
    ctx.d_resident_mem.fill(nullptr);
@@ -708,6 +732,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
    ctx.resident_clock = 0;
    ctx.d_state0.fill(nullptr);
    ctx.d_accum.fill(nullptr);
+    ctx.d_escalar0.fill(nullptr);
+    ctx.d_escalar_accum.fill(nullptr);
    ctx.d_state_curr.fill(nullptr);
    ctx.d_state_next.fill(nullptr);
    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
@@ -725,6 +751,8 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
 {
    ctx.d_state0_mem = alloc.d_state0_mem;
    ctx.d_accum_mem = alloc.d_accum_mem;
+    ctx.d_escalar0_mem = alloc.d_escalar0_mem;
+    ctx.d_escalar_accum_mem = alloc.d_escalar_accum_mem;
    ctx.d_resident_mem = alloc.d_resident_mem;
    ctx.d_state_curr_mem = nullptr;
    ctx.d_state_next_mem = nullptr;
@@ -849,6 +877,12 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
            ctx.d_resident[b][i] = ctx.d_resident_mem[b] + (size_t)i * all;
        }
    }
+    if (ctx.d_escalar0_mem && ctx.d_escalar_accum_mem) {
+        for (int i = 0; i < 2; ++i) {
+            ctx.d_escalar0[i] = ctx.d_escalar0_mem + (size_t)i * all;
+            ctx.d_escalar_accum[i] = ctx.d_escalar_accum_mem + (size_t)i * all;
+        }
+    }
    if (ctx.current_bank >= 0) {
        ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank];
        ctx.d_state_curr = ctx.d_resident[ctx.current_bank];
@@ -859,6 +893,18 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
    return ctx;
 }

+static void ensure_escalar_buffers(StepContext &ctx, size_t all)  // Allocates the two scalar-field device buffers on ctx if missing and rebuilds the per-field pointers.
+{
+    if (!ctx.d_escalar0_mem)  // allocate only when no buffer is attached to this context yet
+        CUDA_CHECK(cudaMalloc(&ctx.d_escalar0_mem, 2 * ctx.cap_all * sizeof(double)));  // two fields of cap_all doubles each
+    if (!ctx.d_escalar_accum_mem)
+        CUDA_CHECK(cudaMalloc(&ctx.d_escalar_accum_mem, 2 * ctx.cap_all * sizeof(double)));  // NOTE(review): sized once from cap_all and never resized here; confirm these are freed or regrown if cap_all changes
+    for (int i = 0; i < 2; ++i) {  // field i starts at element offset i * all; the stride is all, not cap_all
+        ctx.d_escalar0[i] = ctx.d_escalar0_mem + (size_t)i * all;
+        ctx.d_escalar_accum[i] = ctx.d_escalar_accum_mem + (size_t)i * all;  // NOTE(review): assumes all <= ctx.cap_all so both slices fit; verify against ensure_step_ctx
+    }
+}
+
 static void release_step_ctx(void *block_tag)
 {
    auto it = g_step_ctx.find(block_tag);
@@ -7113,14 +7159,78 @@ int bssn_cuda_compute_escalar_matter(void *block_tag,
        ctx.d_matter[4], ctx.d_matter[5], ctx.d_matter[6],
        ctx.d_matter[7], ctx.d_matter[8], ctx.d_matter[9],
        a2);
-    CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaDeviceSynchronize());
+    if (!escalar_gpu_rk_enabled()) {
+        CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
    ctx.matter_ready = true;
    (void)Lev;
    return 0;
 }

+extern "C"
+int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag,  // GPU-side finalize of one RK stage for the two scalar fields; returns 0 when the GPU path ran, 1 when it was skipped.
+                                             int *ex, double *X, double *Y, double *Z,  // ex[0..2] are multiplied together to get the point count; X, Y, Z are forwarded to setup_grid_params and the boundary routine
+                                             double *Sphi_out_host,  // host destination for the copy from slot S_Gamxa
+                                             double *Spi_out_host,  // host destination for the copy from slot S_Gamya
+                                             const double *propspeed,  // [0] is used for the first field, [1] for the second
+                                             const double *soa_flat,  // [0..2] are passed for the first field, [3..5] for the second
+                                             const double *bbox,
+                                             double &dT,
+                                             int &RK4,  // stage index; only 0..3 are accepted
+                                             int &apply_bam_bc,  // non-zero enables the gpu_sommerfeld_routbam calls
+                                             int &Symmetry,
+                                             int &Lev,  // unused in this function
+                                             double &eps,
+                                             int &precor)
+{
+    if (!escalar_gpu_rk_enabled())  // feature gate: do nothing and return 1 when the GPU RK path is disabled
+        return 1;
+    if (RK4 < 0 || RK4 > 3)  // reject stage indices outside 0..3
+        return 1;
+
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];  // total point count; cast first so the product is computed in size_t
+    const size_t bytes = all * sizeof(double);
+    setup_grid_params(ex, X, Y, Z, Symmetry, eps, precor);
+    StepContext &ctx = ensure_step_ctx(block_tag, all);
+    ensure_escalar_buffers(ctx, all);
+
+    if (RK4 == 0) {  // stage 0 only: snapshot slots S_S_arr and S_f_arr into d_escalar0
+        CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar0[0], g_buf.slot[S_S_arr],
+                                   bytes, cudaMemcpyDeviceToDevice));
+        CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar0[1], g_buf.slot[S_f_arr],
+                                   bytes, cudaMemcpyDeviceToDevice));
+    }
+
+    if (apply_bam_bc) {  // optional boundary step, called once per field with that field's slot pair
+        gpu_sommerfeld_routbam(g_buf.slot[S_S_arr], g_buf.slot[S_Gamxa],
+                               propspeed[0],
+                               soa_flat[0], soa_flat[1], soa_flat[2],
+                               X, Y, Z, bbox, Symmetry);
+        gpu_sommerfeld_routbam(g_buf.slot[S_f_arr], g_buf.slot[S_Gamya],
+                               propspeed[1],
+                               soa_flat[3], soa_flat[4], soa_flat[5],
+                               X, Y, Z, bbox, Symmetry);
+    }
+
+    kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_escalar0[0], g_buf.slot[S_Gamxa],
+                                          ctx.d_escalar_accum[0], dT, RK4);  // NOTE(review): no cudaGetLastError after this launch; a bad launch config is not checked here directly
+    kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_escalar0[1], g_buf.slot[S_Gamya],
+                                          ctx.d_escalar_accum[1], dT, RK4);  // NOTE(review): presumably the kernel writes the updated field into its second argument; confirm
+
+    try_pin_escalar_host_buffer(Sphi_out_host, bytes);  // presumably best-effort pinning of the host buffer; verify in the helper
+    try_pin_escalar_host_buffer(Spi_out_host, bytes);
+    CUDA_CHECK(cudaMemcpyAsync(Sphi_out_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));  // host output comes from slot S_Gamxa, not S_S_arr
+    CUDA_CHECK(cudaMemcpyAsync(Spi_out_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));  // host output comes from slot S_Gamya, not S_f_arr
+    CUDA_CHECK(cudaDeviceSynchronize());  // wait for all queued device work, including both copies, before returning
+    (void)Lev;
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_rk4_substep(void *block_tag,
                          int *ex, double *X, double *Y, double *Z,