Refine GPU runtime controls and input checker

2026-05-18 01:02:55 +08:00
parent f2264989d8
commit a99534d2f3
8 changed files with 514 additions and 67 deletions
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -2792,12 +2792,13 @@ void kern_escalar_sources(
    double * __restrict__ Sxz,
    double * __restrict__ Syy,
    double * __restrict__ Syz,
-    double * __restrict__ Szz)
+    double * __restrict__ Szz,
+    double escalar_a2)
 {
    constexpr double PI_V = 3.141592653589793238462643383279502884;
    constexpr double TWO = 2.0;
    constexpr double HALF = 0.5;
-    constexpr double A2 = 3.0;
+    const double A2 = escalar_a2;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < d_gp.all;
@@ -2852,7 +2853,7 @@ void kern_escalar_sources(
    }
 }

-static void gpu_escalar_sources(int all)
+static void gpu_escalar_sources(int all, double escalar_a2)
 {
    #define D(s) g_buf.slot[s]
    gpu_fderivs(D(S_Sphi), D(S_Sphi_x), D(S_Sphi_y), D(S_Sphi_z), 1.0, 1.0, 1.0, all);
@@ -2872,7 +2873,8 @@ static void gpu_escalar_sources(int all)
        D(S_Sphi_yy), D(S_Sphi_yz), D(S_Sphi_zz),
        D(S_Sphi_rhs), D(S_Spi_rhs),
        D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz),
-        D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz));
+        D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz),
+        escalar_a2);
    #undef D
 }

@@ -6571,7 +6573,8 @@ static int active_or_keyed_bank(StepContext &ctx,
    return 0;
 }

-static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escalar = false)
+static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escalar = false,
+                                double escalar_a2 = 3.0)
 {
    const double SYM = 1.0;
    const double ANTI = -1.0;
@@ -6652,7 +6655,7 @@ static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escala
        D(S_gupyy), D(S_gupyz), D(S_gupzz));

    if (compute_escalar) {
-        gpu_escalar_sources(all);
+        gpu_escalar_sources(all, escalar_a2);
        gpu_fderivs(D(S_trK), D(S_trK_x), D(S_trK_y), D(S_trK_z), SYM, SYM, SYM, all);
    }

@@ -7127,9 +7130,8 @@ int bssn_escalar_cuda_rk4_substep(void *block_tag,
 #ifdef fortran3
    set_escalar_parameter_(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
 #endif
-    if (fabs(escalar_a2 - 3.0) > 1.0e-12 && g_dispatch.my_rank == 0) {
-        fprintf(stderr, "CUDA BSSN-EScalar currently supports FR a2=3 for EScalar_CC=2/3; got %.17g\n",
-                escalar_a2);
+    if (fabs(escalar_a2) <= 1.0e-300 && g_dispatch.my_rank == 0) {
+        fprintf(stderr, "CUDA BSSN-EScalar requires nonzero FR a2; got %.17g\n", escalar_a2);
        return 1;
    }

@@ -7187,7 +7189,7 @@ int bssn_escalar_cuda_rk4_substep(void *block_tag,
        }
    }

-    launch_rhs_pipeline((int)all, eps, co, true);
+    launch_rhs_pipeline((int)all, eps, co, true, escalar_a2);

    if (apply_bam_bc) {
        for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) {
@@ -7250,7 +7252,7 @@ int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double
    const size_t bytes = all * sizeof(double);
    setup_grid_params(ex, X, Y, Z, Symmetry, eps, 0);
    upload_escalar_state_inputs(state_host_in, all);
-    launch_rhs_pipeline((int)all, eps, 0, true);
+    launch_rhs_pipeline((int)all, eps, 0, true, escalar_a2);

    #define D(s) g_buf.slot[s]
    kern_escalar_constraint_fr<<<grid(all), BLK>>>(