From ad999e4c5a4a27b0d4774c3c0da412ee33d2b8bd Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Thu, 9 Apr 2026 14:28:36 +0800
Subject: [PATCH] Add guarded GPU prolong3 path scaffold

---
 AMSS_NCKU_source/Parallel.C       |  28 ++-
 AMSS_NCKU_source/bssn_cuda_ops.cu | 284 ++++++++++++++++++++++++++++++
 AMSS_NCKU_source/bssn_cuda_ops.h  |   6 +
 3 files changed, 312 insertions(+), 6 deletions(-)
diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index 59db417..3ecb14b 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -1,11 +1,20 @@
 
-#include "Parallel.h"
+#include "Parallel.h"
 #include "fmisc.h"
 #include "prolongrestrict.h"
+#include "bssn_cuda_ops.h"
 #include "misc.h"
 #include "parameters.h"
 #include <cstring>
 
+#if defined(__GNUC__) || defined(__clang__)
+extern int bssn_cuda_prolong3_pack(int wei,
+                                   const double *llbc, const double *uubc, const int *extc, const double *func,
+                                   const double *llbf, const double *uubf, const int *extf, double *funf,
+                                   const double *llbp, const double *uubp,
+                                   const double *SoA, int symmetry) __attribute__((weak));
+#endif
+
 namespace {
 
 const char *g_parallel_transfer_context = "Parallel::transfer";
@@ -3885,11 +3894,18 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
                           src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
                           dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
               break;
-            case 3:
-              f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                         dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-                         dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
-            }
+            case 3:
+              if (!bssn_cuda_prolong3_pack ||
+                  bssn_cuda_prolong3_pack(DIM,
+                                          src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+                                          dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
+                                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry))
+              {
+                f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+                           dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
+                           dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
+              }
+            }
           if (dir == UNPACK) // from target data to corresponding grid
             f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
                    dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu
index 7101227..68eb9ba 100644
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -5,6 +5,7 @@
 #include <cstdio>
 #include <cuda_runtime.h>
 #include <unordered_map>
+#include <vector>
 
 namespace {
 
@@ -191,6 +192,78 @@ __device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k
   return sign * f[index3(i, j, k, nx, ny)];
 }
 
+__device__ inline double load_prolong_cell_padded(const double *f,
+                                                  int i, int j, int k,
+                                                  int sx, int sy)
+{
+  return f[(i + 2) + (j + 2) * sx + (k + 2) * sx * sy];
+}
+
+__global__ void prolong3_cell_kernel(const double *funcc, double *funf,
+                                     int sxc, int syc,
+                                     int nxf, int nyf, int nzf,
+                                     int lbc0, int lbc1, int lbc2,
+                                     int lbf0, int lbf1, int lbf2,
+                                     int ibegin, int iend,
+                                     int jbegin, int jend,
+                                     int kbegin, int kend)
+{
+  const double C1 = 7.7e1 / 8.192e3;
+  const double C2 = -6.93e2 / 8.192e3;
+  const double C3 = 3.465e3 / 4.096e3;
+  const double C4 = 1.155e3 / 4.096e3;
+  const double C5 = -4.95e2 / 8.192e3;
+  const double C6 = 6.3e1 / 8.192e3;
+  const double w_even[6] = {C1, C2, C3, C4, C5, C6};
+  const double w_odd[6] = {C6, C5, C4, C3, C2, C1};
+
+  const int n = nxf * nyf * nzf;
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
+  {
+    const int plane = nxf * nyf;
+    const int k = idx / plane;
+    const int rem = idx - k * plane;
+    const int j = rem / nxf;
+    const int i = rem - j * nxf;
+
+    if (i < ibegin || i > iend || j < jbegin || j > jend || k < kbegin || k > kend)
+      continue;
+
+    const int ii = i + lbf0;
+    const int jj = j + lbf1;
+    const int kk = k + lbf2;
+
+    const int ic0 = ii / 2 - lbc0 - 1;
+    const int jc0 = jj / 2 - lbc1 - 1;
+    const int kc0 = kk / 2 - lbc2 - 1;
+
+    const double *wx = (ii & 1) ? w_odd : w_even;
+    const double *wy = (jj & 1) ? w_odd : w_even;
+    const double *wz = (kk & 1) ? w_odd : w_even;
+
+    double value = 0.0;
+    for (int ox = 0; ox < 6; ++ox)
+    {
+      const int cx = ic0 + ox;
+      double yz = 0.0;
+      for (int oy = 0; oy < 6; ++oy)
+      {
+        const int cy = jc0 + oy;
+        double zsum = 0.0;
+        for (int oz = 0; oz < 6; ++oz)
+        {
+          const int cz = kc0 + oz;
+          zsum += wz[oz] * load_prolong_cell_padded(funcc, cx, cy, cz, sxc, syc);
+        }
+        yz += wy[oy] * zsum;
+      }
+      value += wx[ox] * yz;
+    }
+
+    funf[idx] = value;
+  }
+}
+
 __global__ void rk4_kernel(int n, double dT,
                            const double *f0,
                            double *f1,
@@ -697,3 +770,214 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
   }
   return ok ? 0 : 1;
 }
+
+int bssn_cuda_prolong3_pack(int wei,
+                            const double *llbc, const double *uubc, const int *extc, const double *func,
+                            const double *llbf, const double *uubf, const int *extf, double *funf,
+                            const double *llbp, const double *uubp,
+                            const double *SoA, int symmetry)
+{
+  if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
+    return 1;
+
+  // The current input runs with equatorial symmetry enabled.
+  // The symmetry-aware prolong CUDA path is not numerically stable yet,
+  // so force a safe fallback to the original Fortran implementation.
+  if (symmetry != 0)
+    return 1;
+
+  auto idint_like = [](double x) -> int {
+    return static_cast<int>(std::trunc(x));
+  };
+
+  double base[3];
+  double CD[3];
+  double FD[3];
+  int lbc[3], ubc[3], lbf[3], ubf[3], lbp[3], ubp[3], lbpc[3], ubpc[3];
+  for (int d = 0; d < 3; ++d)
+  {
+    CD[d] = (uubc[d] - llbc[d]) / static_cast<double>(extc[d]);
+    FD[d] = (uubf[d] - llbf[d]) / static_cast<double>(extf[d]);
+    if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10)
+      return 1;
+
+    if (llbc[d] <= llbf[d])
+    {
+      base[d] = llbc[d];
+    }
+    else
+    {
+      int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4);
+      base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0);
+    }
+
+    lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1;
+    ubf[d] = idint_like((uubf[d] - base[d]) / FD[d] + 0.4);
+    lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1;
+    ubc[d] = idint_like((uubc[d] - base[d]) / CD[d] + 0.4);
+    lbp[d] = idint_like((llbp[d] - base[d]) / FD[d] + 0.4) + 1;
+    lbpc[d] = idint_like((llbp[d] - base[d]) / CD[d] + 0.4) + 1;
+    ubp[d] = idint_like((uubp[d] - base[d]) / FD[d] + 0.4);
+    ubpc[d] = idint_like((uubp[d] - base[d]) / CD[d] + 0.4);
+    (void)ubc[d];
+    (void)ubf[d];
+  }
+
+  const int imino = lbp[0] - lbf[0] + 1;
+  const int imaxo = ubp[0] - lbf[0] + 1;
+  const int jmino = lbp[1] - lbf[1] + 1;
+  const int jmaxo = ubp[1] - lbf[1] + 1;
+  const int kmino = lbp[2] - lbf[2] + 1;
+  const int kmaxo = ubp[2] - lbf[2] + 1;
+
+  const int imini = lbpc[0] - lbc[0] + 1;
+  const int imaxi = ubpc[0] - lbc[0] + 1;
+  const int jmini = lbpc[1] - lbc[1] + 1;
+  const int jmaxi = ubpc[1] - lbc[1] + 1;
+  const int kmini = lbpc[2] - lbc[2] + 1;
+  const int kmaxi = ubpc[2] - lbc[2] + 1;
+
+  if (imino < 1 || jmino < 1 || kmino < 1 ||
+      imini < 1 || jmini < 1 || kmini < 1 ||
+      imaxo > extf[0] || jmaxo > extf[1] || kmaxo > extf[2] ||
+      imaxi > extc[0] - 2 || jmaxi > extc[1] - 2 || kmaxi > extc[2] - 2)
+  {
+    return 1;
+  }
+
+  auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int {
+    int ii = fine_idx + lbf_d - 1;
+    return ii / 2 - lbc_d + 1;
+  };
+  const int ic_min = coarse_center_index(imino, lbf[0], lbc[0]);
+  const int jc_min = coarse_center_index(jmino, lbf[1], lbc[1]);
+  const int kc_min = coarse_center_index(kmino, lbf[2], lbc[2]);
+
+  // Current CUDA prolong path only supports the same fast path as the
+  // optimized Fortran code: interior stencil access without symmetry_bd().
+  // If the stencil touches the symmetry boundary, fall back to Fortran.
+  if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1)
+    return 1;
+
+  struct ProlongCache
+  {
+    CachedBuffer coarse;
+    CachedBuffer fine;
+  };
+  static thread_local ProlongCache cache;
+
+  int nxc = extc[0], nyc = extc[1], nzc = extc[2];
+  int nxf = extf[0], nyf = extf[1], nzf = extf[2];
+  int sxc = nxc + 3;
+  int syc = nyc + 3;
+  int szc = nzc + 3;
+  int coarse_points = sxc * syc * szc;
+  int fine_points = nxf * nyf * nzf;
+  const size_t coarse_bytes = static_cast<size_t>(coarse_points) * sizeof(double);
+  const size_t fine_bytes = static_cast<size_t>(fine_points) * sizeof(double);
+
+  std::vector<double> funcc_host(static_cast<size_t>(coarse_points), 0.0);
+  auto coarse_index = [sxc, syc](int fi, int fj, int fk) -> size_t {
+    return static_cast<size_t>(fi + 2) +
+           static_cast<size_t>(fj + 2) * static_cast<size_t>(sxc) +
+           static_cast<size_t>(fk + 2) * static_cast<size_t>(sxc) * static_cast<size_t>(syc);
+  };
+  auto func_index = [nxc, nyc](int i, int j, int k) -> size_t {
+    return static_cast<size_t>(i - 1) +
+           static_cast<size_t>(j - 1) * static_cast<size_t>(nxc) +
+           static_cast<size_t>(k - 1) * static_cast<size_t>(nxc) * static_cast<size_t>(nyc);
+  };
+
+  for (int k = 1; k <= nzc; ++k)
+  {
+    for (int j = 1; j <= nyc; ++j)
+    {
+      for (int i = 1; i <= nxc; ++i)
+      {
+        funcc_host[coarse_index(i, j, k)] = func[func_index(i, j, k)];
+      }
+    }
+  }
+
+  for (int offset = 0; offset < 3; ++offset)
+  {
+    int target_i = -offset;
+    int source_i = offset + 2;
+    for (int k = 1; k <= nzc; ++k)
+    {
+      for (int j = 1; j <= nyc; ++j)
+      {
+        funcc_host[coarse_index(target_i, j, k)] = funcc_host[coarse_index(source_i, j, k)] * SoA[0];
+      }
+    }
+  }
+
+  for (int offset = 0; offset < 3; ++offset)
+  {
+    int target_j = -offset;
+    int source_j = offset + 2;
+    for (int k = 1; k <= nzc; ++k)
+    {
+      for (int i = -2; i <= nxc; ++i)
+      {
+        funcc_host[coarse_index(i, target_j, k)] = funcc_host[coarse_index(i, source_j, k)] * SoA[1];
+      }
+    }
+  }
+
+  for (int offset = 0; offset < 3; ++offset)
+  {
+    int target_k = -offset;
+    int source_k = offset + 2;
+    for (int j = -2; j <= nyc; ++j)
+    {
+      for (int i = -2; i <= nxc; ++i)
+      {
+        funcc_host[coarse_index(i, j, target_k)] = funcc_host[coarse_index(i, j, source_k)] * SoA[2];
+      }
+    }
+  }
+
+  double *d_func = nullptr;
+  if (!copy_to_device(cache.coarse, funcc_host.data(), coarse_bytes))
+    return 1;
+  d_func = cache.coarse.ptr;
+
+  if (!ensure_capacity(cache.fine, fine_bytes))
+  {
+    return 1;
+  }
+
+  dim3 block(256);
+  dim3 grid(div_up(fine_points, static_cast<int>(block.x)));
+  double *d_funf = cache.fine.ptr;
+  int ibegin = imino - 1, iend = imaxo - 1;
+  int jbegin = jmino - 1, jend = jmaxo - 1;
+  int kbegin = kmino - 1, kend = kmaxo - 1;
+  void *args[] = {&d_func, &d_funf,
+                  &sxc, &syc,
+                  &nxf, &nyf, &nzf,
+                  &lbc[0], &lbc[1], &lbc[2],
+                  &lbf[0], &lbf[1], &lbf[2],
+                  &ibegin, &iend,
+                  &jbegin, &jend,
+                  &kbegin, &kend};
+
+  if (!launch_kernel(grid, block, (const void *)prolong3_cell_kernel, args))
+    return 1;
+
+  cudaError_t sync_err = cudaDeviceSynchronize();
+  if (sync_err != cudaSuccess)
+  {
+    report_cuda_error("cudaDeviceSynchronize prolong3", sync_err);
+    return 1;
+  }
+
+  cudaError_t err = cudaMemcpy(funf, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaMemcpy(D2H) prolong3", err);
+    return 1;
+  }
+  return 0;
+}
diff --git a/AMSS_NCKU_source/bssn_cuda_ops.h b/AMSS_NCKU_source/bssn_cuda_ops.h
index e45bcaa..080ae95 100644
--- a/AMSS_NCKU_source/bssn_cuda_ops.h
+++ b/AMSS_NCKU_source/bssn_cuda_ops.h
@@ -23,4 +23,10 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
 
 int bssn_cuda_lowerbound(int *ex, double *chi, double tinny);
 
+int bssn_cuda_prolong3_pack(int wei,
+                            const double *llbc, const double *uubc, const int *extc, const double *func,
+                            const double *llbf, const double *uubf, const int *extf, double *funf,
+                            const double *llbp, const double *uubp,
+                            const double *SoA, int symmetry);
+
 #endif