add fused symmetry packing kernels for orders 2 and 3 in BSSN RHS

2026-02-28 15:35:14 +08:00
parent 94f40627aa
commit abf2f640e4
1 changed files with 98 additions and 48 deletions
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -293,6 +293,41 @@ __global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
    }
 }

+/* Fused symmetry pack (ord=2): fill full fh from interior func in one pass. */
+__global__ void kern_symbd_pack_ord2(const double * __restrict__ func,
+                                     double * __restrict__ fh,
+                                     double SoA0, double SoA1, double SoA2)
+{
+    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
+    const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny, fnz = d_gp.fh2_nz;
+    const int total = fnx * fny * fnz;
+
+    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
+         tid < total;
+         tid += blockDim.x * gridDim.x)
+    {
+        int ii = tid % fnx;
+        int jj = (tid / fnx) % fny;
+        int kk = tid / (fnx * fny);
+
+        int iF = ii - 1;   /* -1 .. nx */
+        int jF = jj - 1;   /* -1 .. ny */
+        int kF = kk - 1;   /* -1 .. nz */
+
+        int siF = (iF <= 0) ? (1 - iF) : iF;  /* 1..nx */
+        int sjF = (jF <= 0) ? (1 - jF) : jF;  /* 1..ny */
+        int skF = (kF <= 0) ? (1 - kF) : kF;  /* 1..nz */
+
+        double sign = 1.0;
+        if (iF <= 0) sign *= SoA0;
+        if (jF <= 0) sign *= SoA1;
+        if (kF <= 0) sign *= SoA2;
+
+        int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny;
+        fh[tid] = sign * func[src];
+    }
+}
+
 /* Step 2: Fill i-ghosts (x-direction symmetry) */
 __global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0)
 {
@@ -386,6 +421,41 @@ __global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
    }
 }

+/* Fused symmetry pack (ord=3): fill full fh from interior func in one pass. */
+__global__ void kern_symbd_pack_ord3(const double * __restrict__ func,
+                                     double * __restrict__ fh,
+                                     double SoA0, double SoA1, double SoA2)
+{
+    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
+    const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny, fnz = d_gp.fh3_nz;
+    const int total = fnx * fny * fnz;
+
+    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
+         tid < total;
+         tid += blockDim.x * gridDim.x)
+    {
+        int ii = tid % fnx;
+        int jj = (tid / fnx) % fny;
+        int kk = tid / (fnx * fny);
+
+        int iF = ii - 2;   /* -2 .. nx */
+        int jF = jj - 2;   /* -2 .. ny */
+        int kF = kk - 2;   /* -2 .. nz */
+
+        int siF = (iF <= 0) ? (1 - iF) : iF;  /* 1..nx */
+        int sjF = (jF <= 0) ? (1 - jF) : jF;  /* 1..ny */
+        int skF = (kF <= 0) ? (1 - kF) : kF;  /* 1..nz */
+
+        double sign = 1.0;
+        if (iF <= 0) sign *= SoA0;
+        if (jF <= 0) sign *= SoA1;
+        if (kF <= 0) sign *= SoA2;
+
+        int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny;
+        fh[tid] = sign * func[src];
+    }
+}
+
 __global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0)
 {
    const int ny = d_gp.ex[1], nz = d_gp.ex[2];
@@ -820,14 +890,9 @@ static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
    const size_t nx = (size_t)g_buf.prev_nx;
    const size_t ny = (size_t)g_buf.prev_ny;
    const size_t nz = (size_t)g_buf.prev_nz;
-    const size_t w_ighost = 2ull * ny * nz;
-    const size_t w_jghost = 2ull * (nx + 2ull) * nz;
-    const size_t w_kghost = 2ull * (nx + 2ull) * (ny + 2ull);
+    const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull);

-    kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
-    kern_symbd_ighost_ord2<<<grid(w_ighost), BLK>>>(fh, SoA0);
-    kern_symbd_jghost_ord2<<<grid(w_jghost), BLK>>>(fh, SoA1);
-    kern_symbd_kghost_ord2<<<grid(w_kghost), BLK>>>(fh, SoA2);
+    kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
    kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
 }

@@ -841,14 +906,9 @@ static void gpu_fdderivs(double *d_f,
    const size_t nx = (size_t)g_buf.prev_nx;
    const size_t ny = (size_t)g_buf.prev_ny;
    const size_t nz = (size_t)g_buf.prev_nz;
-    const size_t w_ighost = 2ull * ny * nz;
-    const size_t w_jghost = 2ull * (nx + 2ull) * nz;
-    const size_t w_kghost = 2ull * (nx + 2ull) * (ny + 2ull);
+    const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull);

-    kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
-    kern_symbd_ighost_ord2<<<grid(w_ighost), BLK>>>(fh, SoA0);
-    kern_symbd_jghost_ord2<<<grid(w_jghost), BLK>>>(fh, SoA1);
-    kern_symbd_kghost_ord2<<<grid(w_kghost), BLK>>>(fh, SoA2);
+    kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
    kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
 }

@@ -861,14 +921,9 @@ static void gpu_lopsided(double *d_f, double *d_f_rhs,
    const size_t nx = (size_t)g_buf.prev_nx;
    const size_t ny = (size_t)g_buf.prev_ny;
    const size_t nz = (size_t)g_buf.prev_nz;
-    const size_t w_ighost = 3ull * ny * nz;
-    const size_t w_jghost = 3ull * (nx + 3ull) * nz;
-    const size_t w_kghost = 3ull * (nx + 3ull) * (ny + 3ull);
+    const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull);

-    kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
-    kern_symbd_ighost_ord3<<<grid(w_ighost), BLK>>>(fh, SoA0);
-    kern_symbd_jghost_ord3<<<grid(w_jghost), BLK>>>(fh, SoA1);
-    kern_symbd_kghost_ord3<<<grid(w_kghost), BLK>>>(fh, SoA2);
+    kern_symbd_pack_ord3<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
    kern_lopsided<<<grid(all), BLK>>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);
 }

@@ -881,14 +936,9 @@ static void gpu_kodis(double *d_f, double *d_f_rhs,
    const size_t nx = (size_t)g_buf.prev_nx;
    const size_t ny = (size_t)g_buf.prev_ny;
    const size_t nz = (size_t)g_buf.prev_nz;
-    const size_t w_ighost = 3ull * ny * nz;
-    const size_t w_jghost = 3ull * (nx + 3ull) * nz;
-    const size_t w_kghost = 3ull * (nx + 3ull) * (ny + 3ull);
+    const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull);

-    kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
-    kern_symbd_ighost_ord3<<<grid(w_ighost), BLK>>>(fh, SoA0);
-    kern_symbd_jghost_ord3<<<grid(w_jghost), BLK>>>(fh, SoA1);
-    kern_symbd_kghost_ord3<<<grid(w_kghost), BLK>>>(fh, SoA2);
+    kern_symbd_pack_ord3<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
    kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
 }