From abf2f640e4e46f54d6cff03a4838f39d4786cdf3 Mon Sep 17 00:00:00 2001
From: ianchb <i@4t.pw>
Date: Sat, 28 Feb 2026 15:35:14 +0800
Subject: [PATCH] add fused symmetry packing kernels for orders 2 and 3 in BSSN
 RHS

---
 AMSS_NCKU_source/bssn_rhs_cuda.cu | 146 ++++++++++++++++++++----------
 1 file changed, 98 insertions(+), 48 deletions(-)

diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu
index 94b073a..9d072c6 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -274,10 +274,10 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
 /*  A. Symmetry boundary kernels (ord=2 and ord=3)                     */
 /* ================================================================== */
 
-/* Step 1: Copy interior into ghost-padded array */
-__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
-                                               double * __restrict__ fh,
-                                               double SoA0, double SoA1, double SoA2)
+/* Step 1: Copy interior into ghost-padded array */
+__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
+                                               double * __restrict__ fh,
+                                               double SoA0, double SoA1, double SoA2)
 {
     const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
     const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;
@@ -290,12 +290,47 @@ __global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
         int k0 = tid / (nx * ny);
         int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
         fh[(iF+1) + (jF+1)*fnx + (kF+1)*fnx*fny] = func[tid];
-    }
-}
-
-/* Step 2: Fill i-ghosts (x-direction symmetry) */
-__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0)
-{
+    }
+}
+
+/* Fused symmetry pack (ord=2): fill full fh from interior func in one pass. */
+__global__ void kern_symbd_pack_ord2(const double * __restrict__ func,
+                                     double * __restrict__ fh,
+                                     double SoA0, double SoA1, double SoA2)
+{
+    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
+    const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny, fnz = d_gp.fh2_nz;
+    const int total = fnx * fny * fnz;
+
+    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
+         tid < total;
+         tid += blockDim.x * gridDim.x)
+    {
+        int ii = tid % fnx;
+        int jj = (tid / fnx) % fny;
+        int kk = tid / (fnx * fny);
+
+        int iF = ii - 1;   /* -1 .. nx */
+        int jF = jj - 1;   /* -1 .. ny */
+        int kF = kk - 1;   /* -1 .. nz */
+
+        int siF = (iF <= 0) ? (1 - iF) : iF;  /* 1..nx */
+        int sjF = (jF <= 0) ? (1 - jF) : jF;  /* 1..ny */
+        int skF = (kF <= 0) ? (1 - kF) : kF;  /* 1..nz */
+
+        double sign = 1.0;
+        if (iF <= 0) sign *= SoA0;
+        if (jF <= 0) sign *= SoA1;
+        if (kF <= 0) sign *= SoA2;
+
+        int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny;
+        fh[tid] = sign * func[src];
+    }
+}
+
+/* Step 2: Fill i-ghosts (x-direction symmetry) */
+__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0)
+{
     const int ny = d_gp.ex[1], nz = d_gp.ex[2];
     const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;
     /* ord=2: fill iF=0 and iF=-1, i.e. ghost layers ii=0 from ii=2, ii=1 from ii=1 */
@@ -369,9 +404,9 @@ __global__ void kern_symbd_kghost_ord2(double * __restrict__ fh, double SoA2)
 
 /* ---- ord=3 variants (for lopsided / kodis) ---- */
 
-__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
-                                               double * __restrict__ fh)
-{
+__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
+                                               double * __restrict__ fh)
+{
     const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
     const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;
     for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -383,11 +418,46 @@ __global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
         int k0 = tid / (nx * ny);
         int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
         fh[(iF+2) + (jF+2)*fnx + (kF+2)*fnx*fny] = func[tid];
-    }
-}
-
-__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0)
-{
+    }
+}
+
+/* Fused symmetry pack (ord=3): fill full fh from interior func in one pass. */
+__global__ void kern_symbd_pack_ord3(const double * __restrict__ func,
+                                     double * __restrict__ fh,
+                                     double SoA0, double SoA1, double SoA2)
+{
+    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
+    const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny, fnz = d_gp.fh3_nz;
+    const int total = fnx * fny * fnz;
+
+    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
+         tid < total;
+         tid += blockDim.x * gridDim.x)
+    {
+        int ii = tid % fnx;
+        int jj = (tid / fnx) % fny;
+        int kk = tid / (fnx * fny);
+
+        int iF = ii - 2;   /* -2 .. nx */
+        int jF = jj - 2;   /* -2 .. ny */
+        int kF = kk - 2;   /* -2 .. nz */
+
+        int siF = (iF <= 0) ? (1 - iF) : iF;  /* 1..nx */
+        int sjF = (jF <= 0) ? (1 - jF) : jF;  /* 1..ny */
+        int skF = (kF <= 0) ? (1 - kF) : kF;  /* 1..nz */
+
+        double sign = 1.0;
+        if (iF <= 0) sign *= SoA0;
+        if (jF <= 0) sign *= SoA1;
+        if (kF <= 0) sign *= SoA2;
+
+        int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny;
+        fh[tid] = sign * func[src];
+    }
+}
+
+__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0)
+{
     const int ny = d_gp.ex[1], nz = d_gp.ex[2];
     const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;
     int total = ny * nz;
@@ -813,21 +883,16 @@ static inline int grid(size_t n) {
 }
 
 /* symmetry_bd on GPU for ord=2, then launch fderivs kernel */
-static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
-                        double SoA0, double SoA1, double SoA2, int all)
+static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
+                        double SoA0, double SoA1, double SoA2, int all)
 {
     double *fh = g_buf.d_fh2;
     const size_t nx = (size_t)g_buf.prev_nx;
     const size_t ny = (size_t)g_buf.prev_ny;
     const size_t nz = (size_t)g_buf.prev_nz;
-    const size_t w_ighost = 2ull * ny * nz;
-    const size_t w_jghost = 2ull * (nx + 2ull) * nz;
-    const size_t w_kghost = 2ull * (nx + 2ull) * (ny + 2ull);
+    const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull);
 
-    kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
-    kern_symbd_ighost_ord2<<<grid(w_ighost), BLK>>>(fh, SoA0);
-    kern_symbd_jghost_ord2<<<grid(w_jghost), BLK>>>(fh, SoA1);
-    kern_symbd_kghost_ord2<<<grid(w_kghost), BLK>>>(fh, SoA2);
+    kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
     kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
 }
 
@@ -841,14 +906,9 @@ static void gpu_fdderivs(double *d_f,
     const size_t nx = (size_t)g_buf.prev_nx;
     const size_t ny = (size_t)g_buf.prev_ny;
     const size_t nz = (size_t)g_buf.prev_nz;
-    const size_t w_ighost = 2ull * ny * nz;
-    const size_t w_jghost = 2ull * (nx + 2ull) * nz;
-    const size_t w_kghost = 2ull * (nx + 2ull) * (ny + 2ull);
+    const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull);
 
-    kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
-    kern_symbd_ighost_ord2<<<grid(w_ighost), BLK>>>(fh, SoA0);
-    kern_symbd_jghost_ord2<<<grid(w_jghost), BLK>>>(fh, SoA1);
-    kern_symbd_kghost_ord2<<<grid(w_kghost), BLK>>>(fh, SoA2);
+    kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
     kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
 }
 
@@ -861,14 +921,9 @@ static void gpu_lopsided(double *d_f, double *d_f_rhs,
     const size_t nx = (size_t)g_buf.prev_nx;
     const size_t ny = (size_t)g_buf.prev_ny;
     const size_t nz = (size_t)g_buf.prev_nz;
-    const size_t w_ighost = 3ull * ny * nz;
-    const size_t w_jghost = 3ull * (nx + 3ull) * nz;
-    const size_t w_kghost = 3ull * (nx + 3ull) * (ny + 3ull);
+    const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull);
 
-    kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
-    kern_symbd_ighost_ord3<<<grid(w_ighost), BLK>>>(fh, SoA0);
-    kern_symbd_jghost_ord3<<<grid(w_jghost), BLK>>>(fh, SoA1);
-    kern_symbd_kghost_ord3<<<grid(w_kghost), BLK>>>(fh, SoA2);
+    kern_symbd_pack_ord3<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
     kern_lopsided<<<grid(all), BLK>>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);
 }
 
@@ -881,14 +936,9 @@ static void gpu_kodis(double *d_f, double *d_f_rhs,
     const size_t nx = (size_t)g_buf.prev_nx;
     const size_t ny = (size_t)g_buf.prev_ny;
     const size_t nz = (size_t)g_buf.prev_nz;
-    const size_t w_ighost = 3ull * ny * nz;
-    const size_t w_jghost = 3ull * (nx + 3ull) * nz;
-    const size_t w_kghost = 3ull * (nx + 3ull) * (ny + 3ull);
+    const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull);
 
-    kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
-    kern_symbd_ighost_ord3<<<grid(w_ighost), BLK>>>(fh, SoA0);
-    kern_symbd_jghost_ord3<<<grid(w_jghost), BLK>>>(fh, SoA1);
-    kern_symbd_kghost_ord3<<<grid(w_kghost), BLK>>>(fh, SoA2);
+    kern_symbd_pack_ord3<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
     kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
 }