Stabilize GPU AMR prolong/restrict paths

2026-04-10 21:57:58 +08:00
parent c578a15ecd
commit a089041c3b
3 changed files with 453 additions and 55 deletions
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -14,6 +14,11 @@ extern int bssn_cuda_prolong3_pack(int wei,
                                   const double *llbf, const double *uubf, const int *extf, double *funf,
                                   const double *llbp, const double *uubp,
                                   const double *SoA, int symmetry) __attribute__((weak));
+extern int bssn_cuda_restrict3_pack(int wei,
+                                    const double *llbc, const double *uubc, const int *extc, double *func,
+                                    const double *llbf, const double *uubf, const int *extf, const double *funf,
+                                    const double *llbr, const double *uubr,
+                                    const double *SoA, int symmetry) __attribute__((weak));
 #endif

 namespace {
@@ -4048,11 +4053,18 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
                     src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
                     dst->data->llb, dst->data->uub);
              break;
-            case 2:
-              f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-                          src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
-              break;
+            case 2:
+              if (!bssn_cuda_restrict3_pack ||
+                  bssn_cuda_restrict3_pack(DIM,
+                                           dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
+                                           src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+                                           dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry))
+              {
+                f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
+                            src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
+                            dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
+              }
+              break;
            case 3:
              if (!bssn_cuda_prolong3_pack ||
                  bssn_cuda_prolong3_pack(DIM,
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -423,14 +423,23 @@ __device__ inline double load_prolong_cell_padded(const double *f,
  return f[(i + 2) + (j + 2) * sx + (k + 2) * sx * sy];
 }

+__device__ inline double load_restrict_cell_padded(const double *f,
+                                                   int i, int j, int k,
+                                                   int sx, int sy)
+{
+  return f[(i + 1) + (j + 1) * sx + (k + 1) * sx * sy];
+}
+
 __global__ void prolong3_cell_kernel(const double *funcc, double *funf,
                                     int sxc, int syc,
+                                     int nxc, int nyc, int nzc,
                                     int nxf, int nyf, int nzf,
                                     int lbc0, int lbc1, int lbc2,
                                     int lbf0, int lbf1, int lbf2,
                                     int ibegin, int iend,
                                     int jbegin, int jend,
-                                     int kbegin, int kend)
+                                     int kbegin, int kend,
+                                     int *error_flag)
 {
  const double C1 = 7.7e1 / 8.192e3;
  const double C2 = -6.93e2 / 8.192e3;
@@ -477,6 +486,15 @@ __global__ void prolong3_cell_kernel(const double *funcc, double *funf,
        for (int oz = 0; oz < 6; ++oz)
        {
          const int cz = kc0 + oz;
+          if (cx < -2 || cx > nxc || cy < -2 || cy > nyc || cz < -2 || cz > nzc)
+          {
+            if (error_flag)
+              *error_flag = 1;
+            zsum = 0.0;
+            yz = 0.0;
+            value = 0.0;
+            goto prolong3_cell_kernel_next_point;
+          }
          zsum += wz[oz] * load_prolong_cell_padded(funcc, cx, cy, cz, sxc, syc);
        }
        yz += wy[oy] * zsum;
@@ -485,6 +503,74 @@ __global__ void prolong3_cell_kernel(const double *funcc, double *funf,
    }

    funf[idx] = value;
+prolong3_cell_kernel_next_point:
+    ;
+  }
+}
+
+__global__ void restrict3_cell_kernel(const double *funf, double *func,
+                                      int sxf, int syf,
+                                      int nxf, int nyf, int nzf,
+                                      int nxc, int nyc, int nzc,
+                                      int lbc0, int lbc1, int lbc2,
+                                      int lbf0, int lbf1, int lbf2,
+                                      int ibegin, int iend,
+                                      int jbegin, int jend,
+                                      int kbegin, int kend,
+                                      int *error_flag)
+{
+  const double C1 = 3.0 / 256.0;
+  const double C2 = -25.0 / 256.0;
+  const double C3 = 75.0 / 128.0;
+  const double w[6] = {C1, C2, C3, C3, C2, C1};
+
+  const int n = nxc * nyc * nzc;
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
+  {
+    const int plane = nxc * nyc;
+    const int k = idx / plane;
+    const int rem = idx - k * plane;
+    const int j = rem / nxc;
+    const int i = rem - j * nxc;
+
+    if (i < ibegin || i > iend || j < jbegin || j > jend || k < kbegin || k > kend)
+      continue;
+
+    const int fi = 2 * (i + lbc0) - lbf0;
+    const int fj = 2 * (j + lbc1) - lbf1;
+    const int fk = 2 * (k + lbc2) - lbf2;
+
+    double value = 0.0;
+    for (int ox = 0; ox < 6; ++ox)
+    {
+      const int fx = fi - 2 + ox;
+      double yz = 0.0;
+      for (int oy = 0; oy < 6; ++oy)
+      {
+        const int fy = fj - 2 + oy;
+        double zsum = 0.0;
+        for (int oz = 0; oz < 6; ++oz)
+        {
+          const int fz = fk - 2 + oz;
+          if (fx < -1 || fx > nxf || fy < -1 || fy > nyf || fz < -1 || fz > nzf)
+          {
+            if (error_flag)
+              *error_flag = 1;
+            zsum = 0.0;
+            yz = 0.0;
+            value = 0.0;
+            goto restrict3_cell_kernel_next_point;
+          }
+          zsum += w[oz] * load_restrict_cell_padded(funf, fx, fy, fz, sxf, syf);
+        }
+        yz += w[oy] * zsum;
+      }
+      value += w[ox] * yz;
+    }
+
+    func[idx] = value;
+restrict3_cell_kernel_next_point:
+    ;
  }
 }

@@ -1287,13 +1373,14 @@ int bssn_cuda_prolong3_pack(int wei,
                            const double *llbp, const double *uubp,
                            const double *SoA, int symmetry)
 {
+  const int no_symm = 0;
+  const int eq_symm = 1;
+
  if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
    return 1;

-  // The symmetry-aware prolong CUDA path is still not equivalent to the
-  // active Cell/ghost_width=3 Fortran implementation, so keep the safe
-  // fallback for all symmetry-enabled cases.
-  if (symmetry != 0)
+  // Keep the CPU fallback for octant and any unrecognized symmetry modes.
+  if (symmetry != no_symm && symmetry != eq_symm)
    return 1;

  auto idint_like = [](double x) -> int {
@@ -1355,28 +1442,55 @@ int bssn_cuda_prolong3_pack(int wei,
    return 1;
  }

+  // Keep thin prolongation slabs on the established CPU path for now.
+  // The GPU kernel is only enabled for chunkier regions where the current
+  // device pack path is known to be safe.
+  if (extf[0] <= 2 * wei || extf[1] <= 2 * wei || extf[2] <= 2 * wei)
+    return 1;
+
  auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int {
    int ii = fine_idx + lbf_d - 1;
    return ii / 2 - lbc_d + 1;
  };
  const int ic_min = coarse_center_index(imino, lbf[0], lbc[0]);
+  const int ic_max = coarse_center_index(imaxo, lbf[0], lbc[0]);
  const int jc_min = coarse_center_index(jmino, lbf[1], lbc[1]);
+  const int jc_max = coarse_center_index(jmaxo, lbf[1], lbc[1]);
  const int kc_min = coarse_center_index(kmino, lbf[2], lbc[2]);
+  const int kc_max = coarse_center_index(kmaxo, lbf[2], lbc[2]);
+  const bool need_equatorial_z_reflect = (symmetry == eq_symm && kc_min - 2 < 1);

-  // Current CUDA prolong path only supports the same fast path as the
-  // optimized Fortran code: interior stencil access without symmetry_bd().
-  if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1)
+  // Match the active Fortran implementation's later stencil safety check.
+  if (ic_max + 3 > extc[0] || jc_max + 3 > extc[1] || kc_max + 3 > extc[2] ||
+      ic_min < 1 || jc_min < 1 || kc_min < 0)
+  {
+    return 1;
+  }
+
+  // Match the active Cell-centered Fortran fast path: only the z<0
+  // reflection needed by equatorial symmetry is handled here.
+  if (ic_min - 2 < 1 || jc_min - 2 < 1 || (kc_min - 2 < 1 && !need_equatorial_z_reflect))
    return 1;

  struct ProlongCache
  {
    CachedBuffer coarse;
    CachedBuffer fine;
+    CachedIntBuffer error_flag;
  };
  static thread_local ProlongCache cache;

-  int nxc = extc[0], nyc = extc[1], nzc = extc[2];
+  int src_nxc = extc[0], src_nyc = extc[1], src_nzc = extc[2];
  int nxf = extf[0], nyf = extf[1], nzf = extf[2];
+  const int stage_ix_lo = ic_min - 2;
+  const int stage_ix_hi = ic_max + 3;
+  const int stage_iy_lo = jc_min - 2;
+  const int stage_iy_hi = jc_max + 3;
+  const int stage_iz_lo = need_equatorial_z_reflect ? 1 : (kc_min - 2);
+  const int stage_iz_hi = kc_max + 3;
+  int nxc = stage_ix_hi - stage_ix_lo + 1;
+  int nyc = stage_iy_hi - stage_iy_lo + 1;
+  int nzc = stage_iz_hi - stage_iz_lo + 1;
  int sxc = nxc + 3;
  int syc = nyc + 3;
  int szc = nzc + 3;
@@ -1391,58 +1505,39 @@ int bssn_cuda_prolong3_pack(int wei,
           static_cast<size_t>(fj + 2) * static_cast<size_t>(sxc) +
           static_cast<size_t>(fk + 2) * static_cast<size_t>(sxc) * static_cast<size_t>(syc);
  };
-  auto func_index = [nxc, nyc](int i, int j, int k) -> size_t {
+  auto func_index = [src_nxc, src_nyc](int i, int j, int k) -> size_t {
    return static_cast<size_t>(i - 1) +
-           static_cast<size_t>(j - 1) * static_cast<size_t>(nxc) +
-           static_cast<size_t>(k - 1) * static_cast<size_t>(nxc) * static_cast<size_t>(nyc);
+           static_cast<size_t>(j - 1) * static_cast<size_t>(src_nxc) +
+           static_cast<size_t>(k - 1) * static_cast<size_t>(src_nxc) * static_cast<size_t>(src_nyc);
  };

-  for (int k = 1; k <= nzc; ++k)
+  for (int k = stage_iz_lo; k <= stage_iz_hi; ++k)
  {
-    for (int j = 1; j <= nyc; ++j)
+    const int lk = k - stage_iz_lo + 1;
+    for (int j = stage_iy_lo; j <= stage_iy_hi; ++j)
    {
-      for (int i = 1; i <= nxc; ++i)
+      const int lj = j - stage_iy_lo + 1;
+      for (int i = stage_ix_lo; i <= stage_ix_hi; ++i)
      {
-        funcc_host[coarse_index(i, j, k)] = func[func_index(i, j, k)];
+        const int li = i - stage_ix_lo + 1;
+        funcc_host[coarse_index(li, lj, lk)] = func[func_index(i, j, k)];
      }
    }
  }

-  for (int offset = 0; offset < 3; ++offset)
+  if (need_equatorial_z_reflect)
  {
-    int target_i = -offset;
-    int source_i = offset + 2;
-    for (int k = 1; k <= nzc; ++k)
+    for (int offset = 0; offset < 3; ++offset)
    {
+      const int target_k = -offset;
+      const int source_k = offset + 1;
      for (int j = 1; j <= nyc; ++j)
      {
-        funcc_host[coarse_index(target_i, j, k)] = funcc_host[coarse_index(source_i, j, k)] * SoA[0];
-      }
-    }
-  }
-
-  for (int offset = 0; offset < 3; ++offset)
-  {
-    int target_j = -offset;
-    int source_j = offset + 2;
-    for (int k = 1; k <= nzc; ++k)
-    {
-      for (int i = -2; i <= nxc; ++i)
-      {
-        funcc_host[coarse_index(i, target_j, k)] = funcc_host[coarse_index(i, source_j, k)] * SoA[1];
-      }
-    }
-  }
-
-  for (int offset = 0; offset < 3; ++offset)
-  {
-    int target_k = -offset;
-    int source_k = offset + 2;
-    for (int j = -2; j <= nyc; ++j)
-    {
-      for (int i = -2; i <= nxc; ++i)
-      {
-        funcc_host[coarse_index(i, j, target_k)] = funcc_host[coarse_index(i, j, source_k)] * SoA[2];
+        for (int i = 1; i <= nxc; ++i)
+        {
+          funcc_host[coarse_index(i, j, target_k)] =
+              funcc_host[coarse_index(i, j, source_k)] * SoA[2];
+        }
      }
    }
  }
@@ -1456,21 +1551,38 @@ int bssn_cuda_prolong3_pack(int wei,
  {
    return 1;
  }
+  if (!ensure_capacity(cache.error_flag, sizeof(int)))
+  {
+    return 1;
+  }
+
+  int zero = 0;
+  cudaError_t err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaMemcpy(H2D) prolong3_error_flag", err);
+    return 1;
+  }

  dim3 block(256);
  dim3 grid(div_up(fine_points, static_cast<int>(block.x)));
  double *d_funf = cache.fine.ptr;
+  int lbc_local[3] = {lbc[0] + stage_ix_lo - 1,
+                      lbc[1] + stage_iy_lo - 1,
+                      lbc[2] + stage_iz_lo - 1};
  int ibegin = imino - 1, iend = imaxo - 1;
  int jbegin = jmino - 1, jend = jmaxo - 1;
  int kbegin = kmino - 1, kend = kmaxo - 1;
  void *args[] = {&d_func, &d_funf,
                  &sxc, &syc,
+                  &nxc, &nyc, &nzc,
                  &nxf, &nyf, &nzf,
-                  &lbc[0], &lbc[1], &lbc[2],
+                  &lbc_local[0], &lbc_local[1], &lbc_local[2],
                  &lbf[0], &lbf[1], &lbf[2],
                  &ibegin, &iend,
                  &jbegin, &jend,
-                  &kbegin, &kend};
+                  &kbegin, &kend,
+                  &cache.error_flag.ptr};

  if (!launch_kernel(grid, block, (const void *)prolong3_cell_kernel, args))
    return 1;
@@ -1478,11 +1590,33 @@ int bssn_cuda_prolong3_pack(int wei,
  cudaError_t sync_err = cudaDeviceSynchronize();
  if (sync_err != cudaSuccess)
  {
+    std::fprintf(stderr,
+                 "prolong3 debug: symmetry=%d extc=(%d,%d,%d) extf=(%d,%d,%d) "
+                 "imino=%d imaxo=%d jmino=%d jmaxo=%d kmino=%d kmaxo=%d "
+                 "ic_min=%d ic_max=%d jc_min=%d jc_max=%d kc_min=%d kc_max=%d "
+                 "lbc=(%d,%d,%d) lbf=(%d,%d,%d)\n",
+                 symmetry,
+                 extc[0], extc[1], extc[2],
+                 extf[0], extf[1], extf[2],
+                 imino, imaxo, jmino, jmaxo, kmino, kmaxo,
+                 ic_min, ic_max, jc_min, jc_max, kc_min, kc_max,
+                 lbc[0], lbc[1], lbc[2],
+                 lbf[0], lbf[1], lbf[2]);
    report_cuda_error("cudaDeviceSynchronize prolong3", sync_err);
    return 1;
  }

-  cudaError_t err = cudaMemcpy(funf, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost);
+  int host_error_flag = 0;
+  err = cudaMemcpy(&host_error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaMemcpy(D2H) prolong3_error_flag", err);
+    return 1;
+  }
+  if (host_error_flag)
+    return 1;
+
+  err = cudaMemcpy(funf, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) prolong3", err);
@@ -1490,3 +1624,249 @@ int bssn_cuda_prolong3_pack(int wei,
  }
  return 0;
 }
+
+int bssn_cuda_restrict3_pack(int wei,
+                             const double *llbc, const double *uubc, const int *extc, double *func,
+                             const double *llbf, const double *uubf, const int *extf, const double *funf,
+                             const double *llbr, const double *uubr,
+                             const double *SoA, int symmetry)
+{
+  const int no_symm = 0;
+  const int eq_symm = 1;
+
+  if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbr || !uubr || !SoA)
+    return 1;
+
+  if (symmetry != no_symm && symmetry != eq_symm)
+    return 1;
+
+  auto idint_like = [](double x) -> int {
+    return static_cast<int>(std::trunc(x));
+  };
+
+  double base[3];
+  double CD[3];
+  double FD[3];
+  int lbc[3], ubc[3], lbf[3], ubf[3], lbr_i[3], ubr_i[3], lbrf[3], ubrf[3];
+  for (int d = 0; d < 3; ++d)
+  {
+    CD[d] = (uubc[d] - llbc[d]) / static_cast<double>(extc[d]);
+    FD[d] = (uubf[d] - llbf[d]) / static_cast<double>(extf[d]);
+    if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10)
+      return 1;
+
+    if (llbc[d] <= llbf[d])
+    {
+      base[d] = llbc[d];
+    }
+    else
+    {
+      int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4);
+      base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0);
+    }
+
+    lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1;
+    ubf[d] = idint_like((uubf[d] - base[d]) / FD[d] + 0.4);
+    lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1;
+    ubc[d] = idint_like((uubc[d] - base[d]) / CD[d] + 0.4);
+    lbr_i[d] = idint_like((llbr[d] - base[d]) / CD[d] + 0.4) + 1;
+    lbrf[d] = idint_like((llbr[d] - base[d]) / FD[d] + 0.4) + 1;
+    ubr_i[d] = idint_like((uubr[d] - base[d]) / CD[d] + 0.4);
+    ubrf[d] = idint_like((uubr[d] - base[d]) / FD[d] + 0.4);
+    (void)ubc[d];
+    (void)ubf[d];
+  }
+
+  const int imino = lbr_i[0] - lbc[0] + 1;
+  const int imaxo = ubr_i[0] - lbc[0] + 1;
+  const int jmino = lbr_i[1] - lbc[1] + 1;
+  const int jmaxo = ubr_i[1] - lbc[1] + 1;
+  const int kmino = lbr_i[2] - lbc[2] + 1;
+  const int kmaxo = ubr_i[2] - lbc[2] + 1;
+
+  const int imini = lbrf[0] - lbf[0] + 1;
+  const int imaxi = ubrf[0] - lbf[0] + 1;
+  const int jmini = lbrf[1] - lbf[1] + 1;
+  const int jmaxi = ubrf[1] - lbf[1] + 1;
+  const int kmini = lbrf[2] - lbf[2] + 1;
+  const int kmaxi = ubrf[2] - lbf[2] + 1;
+
+  if (imino < 1 || jmino < 1 || kmino < 1 ||
+      imini < 1 || jmini < 1 || kmini < 1 ||
+      imaxo > extc[0] || jmaxo > extc[1] || kmaxo > extc[2] ||
+      imaxi > extf[0] - 2 || jmaxi > extf[1] - 2 || kmaxi > extf[2] - 2)
+  {
+    return 1;
+  }
+
+  // Keep thin restrict slabs on the established CPU path for now.
+  if (extc[0] <= 2 * wei || extc[1] <= 2 * wei || extc[2] <= 2 * wei ||
+      extf[0] <= 2 * wei || extf[1] <= 2 * wei || extf[2] <= 2 * wei)
+  {
+    return 1;
+  }
+
+  const int fi_min = 2 * (imino + lbc[0] - 1) - lbf[0];
+  const int fi_max = 2 * (imaxo + lbc[0] - 1) - lbf[0];
+  const int fj_min = 2 * (jmino + lbc[1] - 1) - lbf[1];
+  const int fj_max = 2 * (jmaxo + lbc[1] - 1) - lbf[1];
+  const int fk_min = 2 * (kmino + lbc[2] - 1) - lbf[2];
+  const int fk_max = 2 * (kmaxo + lbc[2] - 1) - lbf[2];
+
+  const int ii_lo = fi_min - 2;
+  const int ii_hi = fi_max + 3;
+  const int jj_lo = fj_min - 2;
+  const int jj_hi = fj_max + 3;
+  const int kk_lo = fk_min - 2;
+  const int kk_hi = fk_max + 3;
+
+  if (ii_hi > extf[0] || jj_hi > extf[1] || kk_hi > extf[2] ||
+      ii_lo < -1 || jj_lo < -1 || kk_lo < -1)
+  {
+    return 1;
+  }
+
+  const bool need_equatorial_z_reflect = (symmetry == eq_symm && kk_lo < 1);
+  // Keep symmetry-reflected restrict windows on the CPU path for now.
+  if (ii_lo < 1 || jj_lo < 1 || kk_lo < 1)
+    return 1;
+
+  struct RestrictCache
+  {
+    CachedBuffer fine;
+    CachedBuffer coarse;
+    CachedIntBuffer error_flag;
+  };
+  static thread_local RestrictCache cache;
+
+  int nxc = extc[0];
+  int nyc = extc[1];
+  int nzc = extc[2];
+  int src_nxf = extf[0];
+  int src_nyf = extf[1];
+  int src_nzf = extf[2];
+  const int stage_fx_lo = ii_lo;
+  const int stage_fx_hi = ii_hi;
+  const int stage_fy_lo = jj_lo;
+  const int stage_fy_hi = jj_hi;
+  const int stage_fz_lo = kk_lo;
+  const int stage_fz_hi = kk_hi;
+  int nxf = stage_fx_hi - stage_fx_lo + 1;
+  int nyf = stage_fy_hi - stage_fy_lo + 1;
+  int nzf = stage_fz_hi - stage_fz_lo + 1;
+  int sxf = nxf + 2;
+  int syf = nyf + 2;
+  int szf = nzf + 2;
+  int fine_points = sxf * syf * szf;
+  int coarse_points = nxc * nyc * nzc;
+  const size_t fine_bytes = static_cast<size_t>(fine_points) * sizeof(double);
+  const size_t coarse_bytes = static_cast<size_t>(coarse_points) * sizeof(double);
+
+  std::vector<double> funff_host(static_cast<size_t>(fine_points), 0.0);
+  auto fine_index = [sxf, syf](int fi, int fj, int fk) -> size_t {
+    return static_cast<size_t>(fi + 1) +
+           static_cast<size_t>(fj + 1) * static_cast<size_t>(sxf) +
+           static_cast<size_t>(fk + 1) * static_cast<size_t>(sxf) * static_cast<size_t>(syf);
+  };
+  auto src_index = [src_nxf, src_nyf](int i, int j, int k) -> size_t {
+    return static_cast<size_t>(i - 1) +
+           static_cast<size_t>(j - 1) * static_cast<size_t>(src_nxf) +
+           static_cast<size_t>(k - 1) * static_cast<size_t>(src_nxf) * static_cast<size_t>(src_nyf);
+  };
+
+  for (int k = stage_fz_lo; k <= stage_fz_hi; ++k)
+  {
+    const int lk = k - stage_fz_lo + 1;
+    for (int j = stage_fy_lo; j <= stage_fy_hi; ++j)
+    {
+      const int lj = j - stage_fy_lo + 1;
+      for (int i = stage_fx_lo; i <= stage_fx_hi; ++i)
+      {
+        const int li = i - stage_fx_lo + 1;
+        funff_host[fine_index(li, lj, lk)] = funf[src_index(i, j, k)];
+      }
+    }
+  }
+
+  (void)need_equatorial_z_reflect;
+
+  if (!copy_to_device(cache.fine, funff_host.data(), fine_bytes))
+    return 1;
+  if (!ensure_capacity(cache.coarse, coarse_bytes))
+    return 1;
+  if (!ensure_capacity(cache.error_flag, sizeof(int)))
+    return 1;
+
+  int zero = 0;
+  cudaError_t err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaMemcpy(H2D) restrict3_error_flag", err);
+    return 1;
+  }
+
+  dim3 block(256);
+  dim3 grid(div_up(coarse_points, static_cast<int>(block.x)));
+  double *d_funf = cache.fine.ptr;
+  double *d_func = cache.coarse.ptr;
+  int lbf_local[3] = {lbf[0] + stage_fx_lo - 1,
+                      lbf[1] + stage_fy_lo - 1,
+                      lbf[2] + stage_fz_lo - 1};
+  int ibegin = imino - 1, iend = imaxo - 1;
+  int jbegin = jmino - 1, jend = jmaxo - 1;
+  int kbegin = kmino - 1, kend = kmaxo - 1;
+  void *args[] = {&d_funf, &d_func,
+                  &sxf, &syf,
+                  &nxf, &nyf, &nzf,
+                  &nxc, &nyc, &nzc,
+                  &lbc[0], &lbc[1], &lbc[2],
+                  &lbf_local[0], &lbf_local[1], &lbf_local[2],
+                  &ibegin, &iend,
+                  &jbegin, &jend,
+                  &kbegin, &kend,
+                  &cache.error_flag.ptr};
+
+  if (!launch_kernel(grid, block, (const void *)restrict3_cell_kernel, args))
+    return 1;
+
+  cudaError_t sync_err = cudaDeviceSynchronize();
+  if (sync_err != cudaSuccess)
+  {
+    std::fprintf(stderr,
+                 "restrict3 debug: symmetry=%d extc=(%d,%d,%d) extf=(%d,%d,%d) "
+                 "imino=%d imaxo=%d jmino=%d jmaxo=%d kmino=%d kmaxo=%d "
+                 "imini=%d imaxi=%d jmini=%d jmaxi=%d kmini=%d kmaxi=%d "
+                 "lbc=(%d,%d,%d) lbf=(%d,%d,%d) "
+                 "fi=[%d,%d] fj=[%d,%d] fk=[%d,%d] window=[%d:%d,%d:%d,%d:%d]\n",
+                 symmetry,
+                 extc[0], extc[1], extc[2],
+                 extf[0], extf[1], extf[2],
+                 imino, imaxo, jmino, jmaxo, kmino, kmaxo,
+                 imini, imaxi, jmini, jmaxi, kmini, kmaxi,
+                 lbc[0], lbc[1], lbc[2],
+                 lbf[0], lbf[1], lbf[2],
+                 fi_min, fi_max, fj_min, fj_max, fk_min, fk_max,
+                 ii_lo, ii_hi, jj_lo, jj_hi, kk_lo, kk_hi);
+    report_cuda_error("cudaDeviceSynchronize restrict3", sync_err);
+    return 1;
+  }
+
+  int host_error_flag = 0;
+  err = cudaMemcpy(&host_error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaMemcpy(D2H) restrict3_error_flag", err);
+    return 1;
+  }
+  if (host_error_flag)
+    return 1;
+
+  err = cudaMemcpy(func, cache.coarse.ptr, coarse_bytes, cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaMemcpy(D2H) restrict3", err);
+    return 1;
+  }
+
+  return 0;
+}
--- a/AMSS_NCKU_source/bssn_cuda_ops.h
+++ b/AMSS_NCKU_source/bssn_cuda_ops.h
@@ -33,6 +33,12 @@ int bssn_cuda_prolong3_pack(int wei,
                            const double *llbp, const double *uubp,
                            const double *SoA, int symmetry);

+int bssn_cuda_restrict3_pack(int wei,
+                             const double *llbc, const double *uubc, const int *extc, double *func,
+                             const double *llbf, const double *uubf, const int *extf, const double *funf,
+                             const double *llbr, const double *uubr,
+                             const double *SoA, int symmetry);
+
 int bssn_cuda_interp_points_batch(const int *ex,
                                  const double *X, const double *Y, const double *Z,
                                  const double *const *fields,