Optimize BSSN EScalar GPU path baseline

2026-05-02 18:19:15 +08:00
parent 52beb4d153
commit 59a216ad93
13 changed files with 1366 additions and 177 deletions
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -4947,6 +4947,118 @@ static void bind_state_output_slots(const std::array<double *, BSSN_STATE_COUNT>
    }
 }

+__global__ __launch_bounds__(128, 4)
+void kern_escalar_matter_rhs(
+    const double * __restrict__ chi,
+    const double * __restrict__ trK,
+    const double * __restrict__ dxx,
+    const double * __restrict__ gxy,
+    const double * __restrict__ gxz,
+    const double * __restrict__ dyy,
+    const double * __restrict__ gyz,
+    const double * __restrict__ dzz,
+    const double * __restrict__ Gamx,
+    const double * __restrict__ Gamy,
+    const double * __restrict__ Gamz,
+    const double * __restrict__ Lap,
+    const double * __restrict__ Sphi,
+    const double * __restrict__ Spi,
+    const double * __restrict__ chix,
+    const double * __restrict__ chiy,
+    const double * __restrict__ chiz,
+    const double * __restrict__ Lapx,
+    const double * __restrict__ Lapy,
+    const double * __restrict__ Lapz,
+    const double * __restrict__ Kx,
+    const double * __restrict__ Ky,
+    const double * __restrict__ Kz,
+    const double * __restrict__ fxx,
+    const double * __restrict__ fxy,
+    const double * __restrict__ fxz,
+    const double * __restrict__ fyy,
+    const double * __restrict__ fyz,
+    const double * __restrict__ fzz,
+    double * __restrict__ Sphi_rhs,
+    double * __restrict__ Spi_rhs,
+    double * __restrict__ rho,
+    double * __restrict__ Sx,
+    double * __restrict__ Sy,
+    double * __restrict__ Sz,
+    double * __restrict__ Sxx,
+    double * __restrict__ Sxy,
+    double * __restrict__ Sxz,
+    double * __restrict__ Syy,
+    double * __restrict__ Syz,
+    double * __restrict__ Szz,
+    double a2)
+{
+    const double TWO = 2.0;
+    const double HALF = 0.5;
+
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+         i < d_gp.all;
+         i += blockDim.x * gridDim.x)
+    {
+        const double chin1 = chi[i] + 1.0;
+        const double alpn1 = Lap[i] + 1.0;
+        const double gxx = dxx[i] + 1.0;
+        const double gyy = dyy[i] + 1.0;
+        const double gzz = dzz[i] + 1.0;
+        const double lgxy = gxy[i];
+        const double lgxz = gxz[i];
+        const double lgyz = gyz[i];
+
+        double det = gxx * gyy * gzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz
+                   - lgxz * gyy * lgxz - lgxy * lgxy * gzz - gxx * lgyz * lgyz;
+        const double gupxx =  (gyy * gzz - lgyz * lgyz) / det;
+        const double gupxy = -(lgxy * gzz - lgyz * lgxz) / det;
+        const double gupxz =  (lgxy * lgyz - gyy * lgxz) / det;
+        const double gupyy =  (gxx * gzz - lgxz * lgxz) / det;
+        const double gupyz = -(gxx * lgyz - lgxy * lgxz) / det;
+        const double gupzz =  (gxx * gyy - lgxy * lgxy) / det;
+
+        double V = 0.0;
+        double dVdSphi = 0.0;
+#if (EScalar_CC == 2 || EScalar_CC == 3)
+        const double sqrt_pi_over_3 = sqrt(PI_VAL / 3.0);
+        const double e4 = exp(4.0 * sqrt_pi_over_3 * Sphi[i]);
+        const double e8n = exp(-8.0 * sqrt_pi_over_3 * Sphi[i]);
+        const double inv_a2 = 1.0 / a2;
+        V = e8n * (1.0 - e4) * (1.0 - e4) / (32.0 * PI_VAL) * inv_a2;
+        dVdSphi = inv_a2 / 12.0 * sqrt(3.0 / PI_VAL) * e8n * (-1.0 + e4);
+#else
+        (void)a2;
+#endif
+
+        Sphi_rhs[i] = alpn1 * Spi[i];
+
+        double srhs = gupxx * fxx[i] + gupyy * fyy[i] + gupzz * fzz[i]
+                    + TWO * (gupxy * fxy[i] + gupxz * fxz[i] + gupyz * fyz[i])
+                    - ((Gamx[i] + (gupxx * chix[i] + gupxy * chiy[i] + gupxz * chiz[i]) * HALF / chin1) * Kx[i]
+                    +  (Gamy[i] + (gupxy * chix[i] + gupyy * chiy[i] + gupyz * chiz[i]) * HALF / chin1) * Ky[i]
+                    +  (Gamz[i] + (gupxz * chix[i] + gupyz * chiy[i] + gupzz * chiz[i]) * HALF / chin1) * Kz[i]);
+        srhs = srhs * alpn1
+             + (gupxx * Lapx[i] * Kx[i] + gupxy * Lapx[i] * Ky[i] + gupxz * Lapx[i] * Kz[i]
+             +  gupxy * Lapy[i] * Kx[i] + gupyy * Lapy[i] * Ky[i] + gupyz * Lapy[i] * Kz[i]
+             +  gupxz * Lapz[i] * Kx[i] + gupyz * Lapz[i] * Ky[i] + gupzz * Lapz[i] * Kz[i]);
+        Spi_rhs[i] = srhs * chin1 + alpn1 * (trK[i] * Spi[i] - dVdSphi);
+
+        rho[i] = chin1 * ((gupxx * Kx[i] * Kx[i] + gupyy * Ky[i] * Ky[i] + gupzz * Kz[i] * Kz[i]) * HALF
+                        + gupxy * Kx[i] * Ky[i] + gupxz * Kx[i] * Kz[i] + gupyz * Ky[i] * Kz[i])
+               + HALF * Spi[i] * Spi[i] + V;
+        Sx[i] = -Spi[i] * Kx[i];
+        Sy[i] = -Spi[i] * Ky[i];
+        Sz[i] = -Spi[i] * Kz[i];
+        const double f = (rho[i] - Spi[i] * Spi[i]) / chin1;
+        Sxx[i] = Kx[i] * Kx[i] - f * gxx;
+        Sxy[i] = Kx[i] * Ky[i] - f * lgxy;
+        Sxz[i] = Kx[i] * Kz[i] - f * lgxz;
+        Syy[i] = Ky[i] * Ky[i] - f * gyy;
+        Syz[i] = Ky[i] * Kz[i] - f * lgyz;
+        Szz[i] = Kz[i] * Kz[i] - f * gzz;
+    }
+}
+
 static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key)
 {
    if (!host_key || bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT)
@@ -6885,6 +6997,97 @@ int f_compute_rhs_bssn(int *ex, double &T,
    return 0;
 }

+extern "C"
+int bssn_cuda_compute_escalar_matter(void *block_tag,
+                                     int *ex, double *X, double *Y, double *Z,
+                                     double **state_host_in,
+                                     double *Sphi_host,
+                                     double *Spi_host,
+                                     double *Sphi_rhs_host,
+                                     double *Spi_rhs_host,
+                                     double a2,
+                                     int &Symmetry,
+                                     int &Lev,
+                                     double &eps,
+                                     int &co,
+                                     int &apply_enforce_ga)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const size_t bytes = all * sizeof(double);
+    setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
+    StepContext &ctx = ensure_step_ctx(block_tag, all);
+
+    const int input_bank = ensure_resident_bank(ctx, state_host_in, all, true);
+    mark_resident_current_bank(ctx, input_bank);
+    bind_state_input_slots(ctx.d_resident[input_bank]);
+    if (apply_enforce_ga) {
+        kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
+                                                 g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
+                                                 g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
+                                                 g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
+        set_resident_host_clean(ctx, input_bank, false);
+    }
+    CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_S_arr], Sphi_host, bytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_f_arr], Spi_host, bytes, cudaMemcpyHostToDevice));
+
+    double *src_fields[3] = {
+        g_buf.slot[S_chi], g_buf.slot[S_Lap], g_buf.slot[S_S_arr]
+    };
+    double *fx_fields[3] = {
+        g_buf.slot[S_chix], g_buf.slot[S_Lapx], g_buf.slot[S_Kx]
+    };
+    double *fy_fields[3] = {
+        g_buf.slot[S_chiy], g_buf.slot[S_Lapy], g_buf.slot[S_Ky]
+    };
+    double *fz_fields[3] = {
+        g_buf.slot[S_chiz], g_buf.slot[S_Lapz], g_buf.slot[S_Kz]
+    };
+    int soa_signs[9] = {
+        1, 1, 1,
+        1, 1, 1,
+        1, 1, 1
+    };
+    gpu_fderivs_batch(3, src_fields, fx_fields, fy_fields, fz_fields,
+                      soa_signs, (int)all);
+
+    double *fd_src[1] = { g_buf.slot[S_S_arr] };
+    double *fxx_fields[1] = { g_buf.slot[S_fxx] };
+    double *fxy_fields[1] = { g_buf.slot[S_fxy] };
+    double *fxz_fields[1] = { g_buf.slot[S_fxz] };
+    double *fyy_fields[1] = { g_buf.slot[S_fyy] };
+    double *fyz_fields[1] = { g_buf.slot[S_fyz] };
+    double *fzz_fields[1] = { g_buf.slot[S_fzz] };
+    gpu_fdderivs_batch(1, fd_src, fxx_fields, fxy_fields, fxz_fields,
+                       fyy_fields, fyz_fields, fzz_fields, soa_signs, (int)all);
+
+    kern_escalar_matter_rhs<<<grid(all), BLK>>>(
+        g_buf.slot[S_chi], g_buf.slot[S_trK],
+        g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
+        g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
+        g_buf.slot[S_Gamx], g_buf.slot[S_Gamy], g_buf.slot[S_Gamz],
+        g_buf.slot[S_Lap],
+        g_buf.slot[S_S_arr], g_buf.slot[S_f_arr],
+        g_buf.slot[S_chix], g_buf.slot[S_chiy], g_buf.slot[S_chiz],
+        g_buf.slot[S_Lapx], g_buf.slot[S_Lapy], g_buf.slot[S_Lapz],
+        g_buf.slot[S_Kx], g_buf.slot[S_Ky], g_buf.slot[S_Kz],
+        g_buf.slot[S_fxx], g_buf.slot[S_fxy], g_buf.slot[S_fxz],
+        g_buf.slot[S_fyy], g_buf.slot[S_fyz], g_buf.slot[S_fzz],
+        g_buf.slot[S_Gamxa], g_buf.slot[S_Gamya],
+        ctx.d_matter[0], ctx.d_matter[1], ctx.d_matter[2], ctx.d_matter[3],
+        ctx.d_matter[4], ctx.d_matter[5], ctx.d_matter[6],
+        ctx.d_matter[7], ctx.d_matter[8], ctx.d_matter[9],
+        a2);
+    CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaDeviceSynchronize());
+    ctx.matter_ready = true;
+    (void)Lev;
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_rk4_substep(void *block_tag,
                          int *ex, double *X, double *Y, double *Z,
@@ -6968,6 +7171,8 @@ int bssn_cuda_rk4_substep(void *block_tag,
    if (RK4 == 0) {
        if (use_zero_matter) {
            if (!ctx.matter_ready) zero_matter_cache(ctx, all);
+        } else if (!matter_host) {
+            if (!ctx.matter_ready) return 1;
        } else {
            upload_matter_cache(ctx, matter_host, all);
        }
@@ -6979,7 +7184,8 @@ int bssn_cuda_rk4_substep(void *block_tag,
                              cudaMemcpyDeviceToDevice));
    } else if (!ctx.matter_ready) {
        if (use_zero_matter) zero_matter_cache(ctx, all);
-        else upload_matter_cache(ctx, matter_host, all);
+        else if (matter_host) upload_matter_cache(ctx, matter_host, all);
+        else return 1;
    }
    bind_matter_slots(ctx);
    if (profile) {
@@ -7107,6 +7313,20 @@ int bssn_cuda_download_resident_state_if_present(void *block_tag,
    return 0;
 }

+extern "C"
+int bssn_cuda_resident_state_matches(void *block_tag,
+                                     double **state_host_key)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    auto it = g_step_ctx.find(block_tag);
+    if (it == g_step_ctx.end() || !resident_key_usable(state_host_key))
+        return 0;
+    StepContext &ctx = it->second;
+    const int bank = find_resident_bank(ctx, state_host_key);
+    return (bank >= 0 && ctx.resident_valid[bank]) ? 1 : 0;
+}
+
 extern "C"
 int bssn_cuda_download_constraint_outputs(int *ex,
                                          double **constraint_host_out)