Add guarded GPU prolong3 path scaffold

2026-04-09 14:28:36 +08:00
parent e1e3b4a448
commit ad999e4c5a
3 changed files with 312 additions and 6 deletions
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -1,11 +1,20 @@
-#include "Parallel.h"
+#include "Parallel.h"
 #include "fmisc.h"
 #include "prolongrestrict.h"
 #include "bssn_cuda_ops.h"
 #include "misc.h"
 #include "parameters.h"
 #include <cstring>
 #if defined(__GNUC__) || defined(__clang__)
 // Weak redeclaration of the CUDA prolongation entry point. With a weak
 // declaration the function's address may be tested before it is called; the
 // prolong3 call site in this file does so and runs f_prolong3 when the symbol
 // is absent or when the call returns nonzero.
 // NOTE(review): presumably this lets builds that do not link the CUDA object
 // still link and run - confirm against the build configuration.
 extern int bssn_cuda_prolong3_pack(int wei,
                                   const double *llbc, const double *uubc, const int *extc, const double *func,
                                   const double *llbf, const double *uubf, const int *extf, double *funf,
                                   const double *llbp, const double *uubp,
                                   const double *SoA, int symmetry) __attribute__((weak));
 #endif
 namespace {
 const char *g_parallel_transfer_context = "Parallel::transfer";
@@ -3885,11 +3894,18 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
                          src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
              break;
-            case 3:
+            case 3:
-              f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+              if (!bssn_cuda_prolong3_pack ||
-                         dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
+                  bssn_cuda_prolong3_pack(DIM,
-                         dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
+                                          src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-            }
+                                          dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
                                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry))
              {
                f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
                           dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
                           dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
              }
            }
          if (dir == UNPACK) // from target data to corresponding grid
            f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
                   dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -5,6 +5,7 @@
 #include <cstdio>
 #include <cuda_runtime.h>
 #include <unordered_map>
 #include <vector>
 namespace {
@@ -191,6 +192,78 @@ __device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k
  return sign * f[index3(i, j, k, nx, ny)];
 }
 // Reads coarse sample (i, j, k) from an array whose storage is shifted by two
 // entries along every axis, so index -2 maps to the first stored element.
 // sx and sy are the padded row length and the padded number of rows per slab.
 __device__ inline double load_prolong_cell_padded(const double *f,
                                                   int i, int j, int k,
                                                   int sx, int sy)
 {
  const int shifted_i = i + 2;
  const int row_offset = (j + 2) * sx;
  const int slab_offset = (k + 2) * sx * sy;
  return f[shifted_i + row_offset + slab_offset];
 }
 // Fills funf inside the index box [ibegin..iend] x [jbegin..jend] x
 // [kbegin..kend] (zero-based, inclusive) with a separable six-point
 // interpolation of the padded coarse array funcc.
 //
 //   funcc        padded coarse data, read through load_prolong_cell_padded
 //                with row length sxc and syc rows per slab.
 //   funf         fine output of nxf * nyf * nzf values, first index fastest.
 //   lbc*, lbf*   per-axis index offsets of the coarse and fine grids.
 //
 // The loop strides by the total number of launched threads, so any 1D launch
 // configuration covers all fine points. Points outside the box are skipped and
 // their funf entries are not written.
 __global__ void prolong3_cell_kernel(const double *funcc, double *funf,
                                      int sxc, int syc,
                                      int nxf, int nyf, int nzf,
                                      int lbc0, int lbc1, int lbc2,
                                      int lbf0, int lbf1, int lbf2,
                                      int ibegin, int iend,
                                      int jbegin, int jend,
                                      int kbegin, int kend)
 {
  // Interpolation weights; the odd-parity table is the even one reversed.
  const double C1 = 7.7e1 / 8.192e3;
  const double C2 = -6.93e2 / 8.192e3;
  const double C3 = 3.465e3 / 4.096e3;
  const double C4 = 1.155e3 / 4.096e3;
  const double C5 = -4.95e2 / 8.192e3;
  const double C6 = 6.3e1 / 8.192e3;
  const double taps_even[6] = {C1, C2, C3, C4, C5, C6};
  const double taps_odd[6] = {C6, C5, C4, C3, C2, C1};
  const int slab = nxf * nyf;
  const int total = nxf * nyf * nzf;
  const unsigned int step = blockDim.x * gridDim.x;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  for (; idx < total; idx += step)
  {
    // Unflatten the linear index into (i, j, k) with i varying fastest.
    const int k = idx / slab;
    const int in_slab = idx - k * slab;
    const int j = in_slab / nxf;
    const int i = in_slab - j * nxf;
    const bool inside = (i >= ibegin && i <= iend) &&
                        (j >= jbegin && j <= jend) &&
                        (k >= kbegin && k <= kend);
    if (inside)
    {
      // Offset fine indices; their parity picks the weight table per axis.
      const int gi = i + lbf0;
      const int gj = j + lbf1;
      const int gk = k + lbf2;
      // First coarse index of the six-point window along each axis.
      const int x0 = gi / 2 - lbc0 - 1;
      const int y0 = gj / 2 - lbc1 - 1;
      const int z0 = gk / 2 - lbc2 - 1;
      const double *wx = (gi & 1) ? taps_odd : taps_even;
      const double *wy = (gj & 1) ? taps_odd : taps_even;
      const double *wz = (gk & 1) ? taps_odd : taps_even;
      // Accumulate z innermost, then y, then x.
      double value = 0.0;
      for (int ox = 0; ox < 6; ++ox)
      {
        double yz = 0.0;
        for (int oy = 0; oy < 6; ++oy)
        {
          double zsum = 0.0;
          for (int oz = 0; oz < 6; ++oz)
          {
            zsum += wz[oz] * load_prolong_cell_padded(funcc, x0 + ox, y0 + oy, z0 + oz, sxc, syc);
          }
          yz += wy[oy] * zsum;
        }
        value += wx[ox] * yz;
      }
      funf[idx] = value;
    }
  }
 }
 __global__ void rk4_kernel(int n, double dT,
                           const double *f0,
                           double *f1,
@@ -697,3 +770,214 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
  }
  return ok ? 0 : 1;
 }
 // GPU attempt at the prolong3 packing step: interpolates the coarse grid
 // function func onto the fine buffer funf inside the region [llbp, uubp].
 //
 // Every array argument except func and funf has three entries, one per axis.
 //   wei          number of dimensions; only 3 is accepted.
 //   llbc, uubc   lower/upper coordinate bounds of the coarse grid.
 //   extc         coarse point counts; func is read as extc[0]*extc[1]*extc[2]
 //                values with the first index varying fastest.
 //   llbf, uubf   lower/upper coordinate bounds of the fine buffer.
 //   extf         fine point counts; funf uses the same layout as func.
 //   llbp, uubp   bounds of the region of funf to fill.
 //   SoA          three factors applied to the mirrored low-side halo values.
 //   symmetry     any nonzero value makes this function decline the request.
 //
 // Returns 0 when the region was computed on the device and stored in funf;
 // entries of funf outside the region are left as they were. Returns 1 when
 // the request is declined or a CUDA step fails, so that the caller can run
 // the CPU prolongation instead.
 int bssn_cuda_prolong3_pack(int wei,
                            const double *llbc, const double *uubc, const int *extc, const double *func,
                            const double *llbf, const double *uubf, const int *extf, double *funf,
                            const double *llbp, const double *uubp,
                            const double *SoA, int symmetry)
 {
  if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
    return 1;
  // The current input runs with equatorial symmetry enabled.
  // The symmetry-aware prolong CUDA path is not numerically stable yet,
  // so force a safe fallback to the original Fortran implementation.
  if (symmetry != 0)
    return 1;
  // Truncation toward zero; combined with the +0.4 offsets below it turns
  // coordinate differences into grid indices.
  auto idint_like = [](double x) -> int {
    return static_cast<int>(std::trunc(x));
  };
  double base[3];
  double CD[3];
  double FD[3];
  int lbc[3], lbf[3], lbp[3], ubp[3], lbpc[3], ubpc[3];
  for (int d = 0; d < 3; ++d)
  {
    // Non-positive extents would divide by zero and describe no grid.
    if (extc[d] <= 0 || extf[d] <= 0)
      return 1;
    CD[d] = (uubc[d] - llbc[d]) / static_cast<double>(extc[d]);
    FD[d] = (uubf[d] - llbf[d]) / static_cast<double>(extf[d]);
    // Spacings are used as divisors below; this also rejects NaN.
    if (!(CD[d] > 0.0) || !(FD[d] > 0.0))
      return 1;
    // Only a 2:1 ratio between coarse and fine spacing is handled.
    if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10)
      return 1;
    if (llbc[d] <= llbf[d])
    {
      base[d] = llbc[d];
    }
    else
    {
      int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4);
      base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0);
    }
    lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1;
    lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1;
    lbp[d] = idint_like((llbp[d] - base[d]) / FD[d] + 0.4) + 1;
    lbpc[d] = idint_like((llbp[d] - base[d]) / CD[d] + 0.4) + 1;
    ubp[d] = idint_like((uubp[d] - base[d]) / FD[d] + 0.4);
    ubpc[d] = idint_like((uubp[d] - base[d]) / CD[d] + 0.4);
  }
  // One-based inclusive bounds of the region in fine (o) and coarse (i) indices.
  const int imino = lbp[0] - lbf[0] + 1;
  const int imaxo = ubp[0] - lbf[0] + 1;
  const int jmino = lbp[1] - lbf[1] + 1;
  const int jmaxo = ubp[1] - lbf[1] + 1;
  const int kmino = lbp[2] - lbf[2] + 1;
  const int kmaxo = ubp[2] - lbf[2] + 1;
  const int imini = lbpc[0] - lbc[0] + 1;
  const int imaxi = ubpc[0] - lbc[0] + 1;
  const int jmini = lbpc[1] - lbc[1] + 1;
  const int jmaxi = ubpc[1] - lbc[1] + 1;
  const int kmini = lbpc[2] - lbc[2] + 1;
  const int kmaxi = ubpc[2] - lbc[2] + 1;
  if (imino < 1 || jmino < 1 || kmino < 1 ||
      imini < 1 || jmini < 1 || kmini < 1 ||
      imaxo > extf[0] || jmaxo > extf[1] || kmaxo > extf[2] ||
      imaxi > extc[0] - 2 || jmaxi > extc[1] - 2 || kmaxi > extc[2] - 2)
  {
    return 1;
  }
  // One-based coarse index around which the stencil of a fine point is placed;
  // the kernel reads coarse indices centre-2 .. centre+3 along each axis.
  auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int {
    int ii = fine_idx + lbf_d - 1;
    return ii / 2 - lbc_d + 1;
  };
  const int ic_min = coarse_center_index(imino, lbf[0], lbc[0]);
  const int jc_min = coarse_center_index(jmino, lbf[1], lbc[1]);
  const int kc_min = coarse_center_index(kmino, lbf[2], lbc[2]);
  const int ic_max = coarse_center_index(imaxo, lbf[0], lbc[0]);
  const int jc_max = coarse_center_index(jmaxo, lbf[1], lbc[1]);
  const int kc_max = coarse_center_index(kmaxo, lbf[2], lbc[2]);
  // Current CUDA prolong path only supports the same fast path as the
  // optimized Fortran code: interior stencil access without symmetry_bd().
  // If the stencil touches the symmetry boundary, fall back to Fortran.
  if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1)
    return 1;
  // The padded coarse array has no high-side halo, so a stencil reaching past
  // the last coarse point would read outside the data; decline instead.
  if (ic_max + 3 > extc[0] || jc_max + 3 > extc[1] || kc_max + 3 > extc[2])
    return 1;
  struct ProlongCache
  {
    CachedBuffer coarse;
    CachedBuffer fine;
  };
  static thread_local ProlongCache cache;
  const int nxc = extc[0], nyc = extc[1], nzc = extc[2];
  int nxf = extf[0], nyf = extf[1], nzf = extf[2];
  // Three extra low-side entries per axis hold coarse indices -2, -1 and 0.
  int sxc = nxc + 3;
  int syc = nyc + 3;
  const int szc = nzc + 3;
  // The kernel indexes with int, so decline grids whose point counts do not fit.
  const long long int_limit = 2147483647LL;
  const long long coarse_count = static_cast<long long>(sxc) * syc * szc;
  const long long fine_count = static_cast<long long>(nxf) * nyf * nzf;
  if (coarse_count > int_limit || fine_count > int_limit)
    return 1;
  const int fine_points = static_cast<int>(fine_count);
  const size_t coarse_bytes = static_cast<size_t>(coarse_count) * sizeof(double);
  const size_t fine_bytes = static_cast<size_t>(fine_count) * sizeof(double);
  std::vector<double> funcc_host(static_cast<size_t>(coarse_count), 0.0);
  // Position of one-based coarse index (fi, fj, fk) inside the padded array.
  auto coarse_index = [sxc, syc](int fi, int fj, int fk) -> size_t {
    return static_cast<size_t>(fi + 2) +
           static_cast<size_t>(fj + 2) * static_cast<size_t>(sxc) +
           static_cast<size_t>(fk + 2) * static_cast<size_t>(sxc) * static_cast<size_t>(syc);
  };
  // Position of one-based coarse index (i, j, k) inside the caller's array.
  auto func_index = [nxc, nyc](int i, int j, int k) -> size_t {
    return static_cast<size_t>(i - 1) +
           static_cast<size_t>(j - 1) * static_cast<size_t>(nxc) +
           static_cast<size_t>(k - 1) * static_cast<size_t>(nxc) * static_cast<size_t>(nyc);
  };
  for (int k = 1; k <= nzc; ++k)
  {
    for (int j = 1; j <= nyc; ++j)
    {
      for (int i = 1; i <= nxc; ++i)
      {
        funcc_host[coarse_index(i, j, k)] = func[func_index(i, j, k)];
      }
    }
  }
  // Low-side halo: mirror indices 2, 3, 4 into 0, -1, -2 along x, then y, then
  // z, scaled by SoA. The stencil guards above keep the kernel away from these
  // entries for now; they are kept for the future symmetry-aware path.
  for (int offset = 0; offset < 3; ++offset)
  {
    int target_i = -offset;
    int source_i = offset + 2;
    for (int k = 1; k <= nzc; ++k)
    {
      for (int j = 1; j <= nyc; ++j)
      {
        funcc_host[coarse_index(target_i, j, k)] = funcc_host[coarse_index(source_i, j, k)] * SoA[0];
      }
    }
  }
  for (int offset = 0; offset < 3; ++offset)
  {
    int target_j = -offset;
    int source_j = offset + 2;
    for (int k = 1; k <= nzc; ++k)
    {
      for (int i = -2; i <= nxc; ++i)
      {
        funcc_host[coarse_index(i, target_j, k)] = funcc_host[coarse_index(i, source_j, k)] * SoA[1];
      }
    }
  }
  for (int offset = 0; offset < 3; ++offset)
  {
    int target_k = -offset;
    int source_k = offset + 2;
    for (int j = -2; j <= nyc; ++j)
    {
      for (int i = -2; i <= nxc; ++i)
      {
        funcc_host[coarse_index(i, j, target_k)] = funcc_host[coarse_index(i, j, source_k)] * SoA[2];
      }
    }
  }
  if (!copy_to_device(cache.coarse, funcc_host.data(), coarse_bytes))
    return 1;
  double *d_func = cache.coarse.ptr;
  if (!ensure_capacity(cache.fine, fine_bytes))
  {
    return 1;
  }
  dim3 block(256);
  dim3 grid(div_up(fine_points, static_cast<int>(block.x)));
  double *d_funf = cache.fine.ptr;
  // Zero-based inclusive bounds of the fine region for the kernel.
  int ibegin = imino - 1, iend = imaxo - 1;
  int jbegin = jmino - 1, jend = jmaxo - 1;
  int kbegin = kmino - 1, kend = kmaxo - 1;
  void *args[] = {&d_func, &d_funf,
                  &sxc, &syc,
                  &nxf, &nyf, &nzf,
                  &lbc[0], &lbc[1], &lbc[2],
                  &lbf[0], &lbf[1], &lbf[2],
                  &ibegin, &iend,
                  &jbegin, &jend,
                  &kbegin, &kend};
  if (!launch_kernel(grid, block, (const void *)prolong3_cell_kernel, args))
    return 1;
  cudaError_t sync_err = cudaDeviceSynchronize();
  if (sync_err != cudaSuccess)
  {
    report_cuda_error("cudaDeviceSynchronize prolong3", sync_err);
    return 1;
  }
  // The kernel writes only the requested region, so the rest of the device
  // buffer holds stale data. Copy straight into funf only when the region is
  // the whole fine box; otherwise stage on the host and copy the region alone.
  const bool covers_all = ibegin == 0 && iend == nxf - 1 &&
                          jbegin == 0 && jend == nyf - 1 &&
                          kbegin == 0 && kend == nzf - 1;
  std::vector<double> staged;
  double *h_dst = funf;
  if (!covers_all)
  {
    staged.resize(static_cast<size_t>(fine_points));
    h_dst = staged.data();
  }
  cudaError_t err = cudaMemcpy(h_dst, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) prolong3", err);
    return 1;
  }
  if (!covers_all)
  {
    for (int k = kbegin; k <= kend; ++k)
    {
      for (int j = jbegin; j <= jend; ++j)
      {
        const size_t row = (static_cast<size_t>(k) * static_cast<size_t>(nyf) + static_cast<size_t>(j)) *
                           static_cast<size_t>(nxf);
        for (int i = ibegin; i <= iend; ++i)
        {
          funf[row + static_cast<size_t>(i)] = staged[row + static_cast<size_t>(i)];
        }
      }
    }
  }
  return 0;
 }
--- a/AMSS_NCKU_source/bssn_cuda_ops.h
+++ b/AMSS_NCKU_source/bssn_cuda_ops.h
@@ -23,4 +23,10 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
 int bssn_cuda_lowerbound(int *ex, double *chi, double tinny);
 /* GPU attempt at the prolong3 step: interpolates the coarse data func onto
  * the fine buffer funf within the region [llbp, uubp].
  * Returns 0 when funf was filled on the device. Returns 1 when the request
  * is declined (wei != 3, a null argument, symmetry != 0, an unsupported grid
  * relation) or a CUDA step fails; the caller should then use the CPU
  * prolongation instead. */
 int bssn_cuda_prolong3_pack(int wei,
                            const double *llbc, const double *uubc, const int *extc, const double *func,
                            const double *llbf, const double *uubf, const int *extf, double *funf,
                            const double *llbp, const double *uubp,
                            const double *SoA, int symmetry);
 #endif