AMSS-NCKU/AMSS_NCKU_source/bssn_cuda_ops.cu

#include "bssn_cuda_ops.h"
#include "bssn_gpu.h"

#include <cmath>
#include <cstdio>
#include <cuda_runtime.h>
#include <unordered_map>
#include <vector>

namespace {

inline void report_cuda_error(const char *where, cudaError_t err)
{
  if (err != cudaSuccess)
    std::fprintf(stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString(err));
}

inline int count_points(const int ex[3])
{
  return ex[0] * ex[1] * ex[2];
}

inline int div_up(int a, int b)
{
  return (a + b - 1) / b;
}

struct DeviceArrays
{
  double *x = nullptr;
  double *y = nullptr;
  double *z = nullptr;
  double *a = nullptr;
  double *b = nullptr;
  double *c = nullptr;
  double *d = nullptr;
};

struct CachedBuffer
{
  double *ptr = nullptr;
  size_t capacity = 0;
};

struct CachedIntBuffer
{
  int *ptr = nullptr;
  size_t capacity = 0;
};

inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
{
  if (bytes <= buffer.capacity && buffer.ptr)
    return true;

  if (buffer.ptr)
  {
    cudaError_t free_err = cudaFree(buffer.ptr);
    if (free_err != cudaSuccess)
      report_cuda_error("cudaFree", free_err);
    buffer.ptr = nullptr;
    buffer.capacity = 0;
  }

  cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMalloc", err);
    return false;
  }

  buffer.capacity = bytes;
  return true;
}

inline bool ensure_capacity(CachedIntBuffer &buffer, size_t bytes)
{
  if (bytes <= buffer.capacity && buffer.ptr)
    return true;

  if (buffer.ptr)
  {
    cudaError_t free_err = cudaFree(buffer.ptr);
    if (free_err != cudaSuccess)
      report_cuda_error("cudaFree", free_err);
    buffer.ptr = nullptr;
    buffer.capacity = 0;
  }

  cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMalloc", err);
    return false;
  }

  buffer.capacity = bytes;
  return true;
}

inline bool copy_to_device(CachedIntBuffer &dst, const int *src, size_t bytes)
{
  if (!ensure_capacity(dst, bytes))
    return false;

  cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D int)", err);
    return false;
  }
  return true;
}

inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
{
  if (!ensure_capacity(dst, bytes))
    return false;

  cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D)", err);
    return false;
  }
  return true;
}

inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *src, size_t bytes)
{
  if (!ensure_capacity(dst, bytes))
    return false;

  const double *device_src = bssn_gpu_find_device_buffer(src);
  cudaMemcpyKind kind = cudaMemcpyHostToDevice;
  const void *copy_src = src;
  if (device_src)
  {
    copy_src = device_src;
    kind = cudaMemcpyDeviceToDevice;
  }

  cudaError_t err = cudaMemcpy(dst.ptr, copy_src, bytes, kind);
  if (err != cudaSuccess)
  {
    report_cuda_error(kind == cudaMemcpyDeviceToDevice ? "cudaMemcpy(D2D)" : "cudaMemcpy(H2D)", err);
    return false;
  }
  return true;
}

inline int interp_idint_like_host(double x)
{
  return static_cast<int>(x);
}

inline void lagrange_weights_ord6_host(double x, double *w)
{
  static const double denom[6] = {-120.0, 24.0, -12.0, 12.0, -24.0, 120.0};
  for (int i = 0; i < 6; ++i)
  {
    double num = 1.0;
    for (int j = 0; j < 6; ++j)
    {
      if (j != i)
        num *= (x - static_cast<double>(j));
    }
    w[i] = num / denom[i];
  }
}

inline bool map_interp_index_host(int logical_idx, int n, int *mapped_idx, int *reflected)
{
  int idx = logical_idx;
  *reflected = 0;
  if (idx < 0)
  {
    idx = -idx;
    *reflected = 1;
  }
  if (idx < 0 || idx >= n)
    return false;
  *mapped_idx = idx;
  return true;
}

inline bool compute_interp_window_host(double x, const double *coord, int n,
                                       int ordn, int allow_reflect,
                                       int *start_idx, double *cx)
{
  const double dx = coord[1] - coord[0];
  const int center = interp_idint_like_host((x - coord[0]) / dx + 0.4);
  const int cmin = allow_reflect ? (-ordn / 2 + 1) : 0;
  const int cmax = n - 1;

  int begin = center - ordn / 2 + 1;
  int end = begin + ordn - 1;
  if (begin < cmin)
  {
    begin = cmin;
    end = begin + ordn - 1;
  }
  if (end > cmax)
  {
    end = cmax;
    begin = end + 1 - ordn;
  }

  if (begin >= 0)
    *cx = (x - coord[begin]) / dx;
  else
    *cx = (x + coord[-begin]) / dx;

  *start_idx = begin;
  return (begin >= cmin && end <= cmax);
}

__global__ void enforce_ga_kernel(int n,
                                  double *dxx, double *gxy, double *gxz,
                                  double *dyy, double *gyz, double *dzz,
                                  double *Axx, double *Axy, double *Axz,
                                  double *Ayy, double *Ayz, double *Azz)
{
  const double one = 1.0;
  const double two = 2.0;
  const double one_third = 1.0 / 3.0;

  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    double lgxx = dxx[idx] + one;
    double lgyy = dyy[idx] + one;
    double lgzz = dzz[idx] + one;
    double lgxy = gxy[idx];
    double lgxz = gxz[idx];
    double lgyz = gyz[idx];

    double det = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz
               - lgxz * lgyy * lgxz - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz;
    double scale = 1.0 / cbrt(det);

    lgxx *= scale;
    lgxy *= scale;
    lgxz *= scale;
    lgyy *= scale;
    lgyz *= scale;
    lgzz *= scale;

    dxx[idx] = lgxx - one;
    gxy[idx] = lgxy;
    gxz[idx] = lgxz;
    dyy[idx] = lgyy - one;
    gyz[idx] = lgyz;
    dzz[idx] = lgzz - one;

    double gupxx = (lgyy * lgzz - lgyz * lgyz);
    double gupxy = -(lgxy * lgzz - lgyz * lgxz);
    double gupxz = (lgxy * lgyz - lgyy * lgxz);
    double gupyy = (lgxx * lgzz - lgxz * lgxz);
    double gupyz = -(lgxx * lgyz - lgxy * lgxz);
    double gupzz = (lgxx * lgyy - lgxy * lgxy);

    double trA = gupxx * Axx[idx] + gupyy * Ayy[idx] + gupzz * Azz[idx]
               + two * (gupxy * Axy[idx] + gupxz * Axz[idx] + gupyz * Ayz[idx]);

    Axx[idx] -= one_third * lgxx * trA;
    Axy[idx] -= one_third * lgxy * trA;
    Axz[idx] -= one_third * lgxz * trA;
    Ayy[idx] -= one_third * lgyy * trA;
    Ayz[idx] -= one_third * lgyz * trA;
    Azz[idx] -= one_third * lgzz * trA;
  }
}

__device__ inline int index3(int i, int j, int k, int nx, int ny)
{
  return i + j * nx + k * nx * ny;
}

__device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k,
                                            int nx, int ny, int nz,
                                            const double soa[3], int symmetry)
{
  double sign = 1.0;
  if (i < 0)
  {
    i = -i - 1;
    sign *= soa[0];
  }
  if (j < 0)
  {
    j = -j - 1;
    sign *= soa[1];
  }
  if (k < 0)
  {
    k = -k - 1;
    sign *= soa[2];
  }
  if (i >= nx) i = nx - 1;
  if (j >= ny) j = ny - 1;
  if (k >= nz) k = nz - 1;
  (void)symmetry;
  return sign * f[index3(i, j, k, nx, ny)];
}

__device__ inline double load_prolong_cell_padded(const double *f,
                                                  int i, int j, int k,
                                                  int sx, int sy)
{
  return f[(i + 2) + (j + 2) * sx + (k + 2) * sx * sy];
}

__global__ void prolong3_cell_kernel(const double *funcc, double *funf,
                                     int sxc, int syc,
                                     int nxf, int nyf, int nzf,
                                     int lbc0, int lbc1, int lbc2,
                                     int lbf0, int lbf1, int lbf2,
                                     int ibegin, int iend,
                                     int jbegin, int jend,
                                     int kbegin, int kend)
{
  const double C1 = 7.7e1 / 8.192e3;
  const double C2 = -6.93e2 / 8.192e3;
  const double C3 = 3.465e3 / 4.096e3;
  const double C4 = 1.155e3 / 4.096e3;
  const double C5 = -4.95e2 / 8.192e3;
  const double C6 = 6.3e1 / 8.192e3;
  const double w_even[6] = {C1, C2, C3, C4, C5, C6};
  const double w_odd[6] = {C6, C5, C4, C3, C2, C1};

  const int n = nxf * nyf * nzf;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nxf * nyf;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nxf;
    const int i = rem - j * nxf;

    if (i < ibegin || i > iend || j < jbegin || j > jend || k < kbegin || k > kend)
      continue;

    const int ii = i + lbf0;
    const int jj = j + lbf1;
    const int kk = k + lbf2;

    const int ic0 = ii / 2 - lbc0 - 1;
    const int jc0 = jj / 2 - lbc1 - 1;
    const int kc0 = kk / 2 - lbc2 - 1;

    const double *wx = (ii & 1) ? w_odd : w_even;
    const double *wy = (jj & 1) ? w_odd : w_even;
    const double *wz = (kk & 1) ? w_odd : w_even;

    double value = 0.0;
    for (int ox = 0; ox < 6; ++ox)
    {
      const int cx = ic0 + ox;
      double yz = 0.0;
      for (int oy = 0; oy < 6; ++oy)
      {
        const int cy = jc0 + oy;
        double zsum = 0.0;
        for (int oz = 0; oz < 6; ++oz)
        {
          const int cz = kc0 + oz;
          zsum += wz[oz] * load_prolong_cell_padded(funcc, cx, cy, cz, sxc, syc);
        }
        yz += wy[oy] * zsum;
      }
      value += wx[ox] * yz;
    }

    funf[idx] = value;
  }
}

__global__ void interp_points_ord6_kernel(int num_points, int num_var,
                                          int nx, int ny,
                                          const double *const *fields,
                                          const double *soa_flat,
                                          const int *stencil_idx,
                                          const int *stencil_reflect,
                                          const double *stencil_weights,
                                          double *out,
                                          int *error_flag)
{
  const int total = num_points * num_var;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += blockDim.x * gridDim.x)
  {
    const int point_id = idx / num_var;
    const int var_id = idx - point_id * num_var;
    const double *field = fields[var_id];
    const double *soa = soa_flat + 3 * var_id;
    const int *idxp = stencil_idx + point_id * 18;
    const int *refp = stencil_reflect + point_id * 18;
    const double *wp = stencil_weights + point_id * 18;

    double value = 0.0;
    #pragma unroll
    for (int kz = 0; kz < 6; ++kz)
    {
      const int z_idx = idxp[12 + kz];
      const double z_sign = refp[12 + kz] ? soa[2] : 1.0;
      double yz_sum = 0.0;
      #pragma unroll
      for (int jy = 0; jy < 6; ++jy)
      {
        const int y_idx = idxp[6 + jy];
        const double y_sign = refp[6 + jy] ? soa[1] : 1.0;
        double x_sum = 0.0;
        #pragma unroll
        for (int ix = 0; ix < 6; ++ix)
        {
          const int x_idx = idxp[ix];
          const double x_sign = refp[ix] ? soa[0] : 1.0;
          const double sign = x_sign * y_sign * z_sign;
          x_sum += wp[ix] * sign * field[index3(x_idx, y_idx, z_idx, nx, ny)];
        }
        yz_sum += wp[6 + jy] * x_sum;
      }
      value += wp[12 + kz] * yz_sum;
    }
    out[idx] = value;
  }
  (void)error_flag;
}

__global__ void rk4_kernel(int n, double dT,
                           const double *f0,
                           double *f1,
                           double *rhs,
                           int stage)
{
  const double half = 0.5;
  const double one_sixth = 1.0 / 6.0;

  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    if (stage == 0)
    {
      f1[idx] = f0[idx] + half * dT * rhs[idx];
    }
    else if (stage == 1)
    {
      rhs[idx] += 2.0 * f1[idx];
      f1[idx] = f0[idx] + half * dT * f1[idx];
    }
    else if (stage == 2)
    {
      rhs[idx] += 2.0 * f1[idx];
      f1[idx] = f0[idx] + dT * f1[idx];
    }
    else
    {
      f1[idx] = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
    }
  }
}

__global__ void lowerbound_kernel(int n, double *chi, double tinny)
{
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    if (chi[idx] < tinny)
      chi[idx] = tinny;
  }
}

__global__ void copy_physical_boundary_kernel(int nx, int ny, int nz,
                                              int has_xmin, int has_ymin, int has_zmin,
                                              int has_xmax, int has_ymax, int has_zmax,
                                              const double *src, double *dst)
{
  const int n = nx * ny * nz;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nx * ny;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nx;
    const int i = rem - j * nx;

    if ((has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
        (has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
        (has_zmin && k == 0) || (has_zmax && k == nz - 1))
    {
      dst[idx] = src[idx];
    }
  }
}

__global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
                                      const double *X, const double *Y, const double *Z,
                                      double xmin, double ymin, double zmin,
                                      double xmax, double ymax, double zmax,
                                      int has_xmin, int has_ymin, int has_zmin,
                                      int has_xmax, int has_ymax, int has_zmax,
                                      int imin, int jmin, int kmin,
                                      double propspeed,
                                      const double *f0,
                                      double *target,
                                      double soa0, double soa1, double soa2,
                                      int symmetry)
{
  const double one = 1.0;
  const double two = 2.0;
  const int n = nx * ny * nz;
  const double dX = X[1] - X[0];
  const double dY = Y[1] - Y[0];
  const double dZ = Z[1] - Z[0];
  const double d2dx = one / (two * dX);
  const double d2dy = one / (two * dY);
  const double d2dz = one / (two * dZ);
  const double soa[3] = {soa0, soa1, soa2};

  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nx * ny;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nx;
    const int i = rem - j * nx;

    const bool on_boundary =
        (has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
        (has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
        (has_zmin && k == 0) || (has_zmax && k == nz - 1);

    if (!on_boundary)
      continue;

    const double radius = sqrt(X[i] * X[i] + Y[j] * Y[j] + Z[k] * Z[k]);
    if (radius == 0.0)
      continue;

    const double wx = propspeed * X[i] / radius;
    const double wy = propspeed * Y[j] / radius;
    const double wz = propspeed * Z[k] / radius;

    double fx = 0.0;
    double fy = 0.0;
    double fz = 0.0;

    if (wx > 0.0)
    {
      if (i - 2 >= imin)
        fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
      else if (i - 1 >= imin)
        fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
      else
        fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
    }
    else if (wx < 0.0)
    {
      if (i + 2 <= nx - 1)
        fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
      else if (i + 1 <= nx - 1)
        fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
      else
        fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
    }

    if (wy > 0.0)
    {
      if (j - 2 >= jmin)
        fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
      else if (j - 1 >= jmin)
        fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
      else
        fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
    }
    else if (wy < 0.0)
    {
      if (j + 2 <= ny - 1)
        fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
      else if (j + 1 <= ny - 1)
        fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
      else
        fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
    }

    if (wz > 0.0)
    {
      if (k - 2 >= kmin)
        fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
      else if (k - 1 >= kmin)
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
      else
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
    }
    else if (wz < 0.0)
    {
      if (k + 2 <= nz - 1)
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
      else if (k + 1 <= nz - 1)
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
      else
        fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
    }

    target[idx] = -propspeed * (fx * X[i] + fy * Y[j] + fz * Z[k] + f0[idx]) / radius;
  }
}

inline bool launch_kernel(dim3 grid, dim3 block, const void *kernel, void **args)
{
  cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaLaunchKernel", err);
    return false;
  }
  err = cudaPeekAtLastError();
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaPeekAtLastError", err);
    return false;
  }
  return true;
}

} // namespace

int bssn_cuda_enforce_ga(int *ex,
                         double *dxx, double *gxy, double *gxz,
                         double *dyy, double *gyz, double *dzz,
                         double *Axx, double *Axy, double *Axz,
                         double *Ayy, double *Ayz, double *Azz)
{
  struct EnforceGaCache
  {
    CachedBuffer dxx, gxy, gxz, dyy, gyz, dzz;
    CachedBuffer Axx, Axy, Axz, Ayy, Ayz, Azz;
  };
  static thread_local EnforceGaCache cache;

  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

  bool ok = copy_to_device_preferring_device(cache.dxx, dxx, bytes) &&
            copy_to_device_preferring_device(cache.gxy, gxy, bytes) &&
            copy_to_device_preferring_device(cache.gxz, gxz, bytes) &&
            copy_to_device_preferring_device(cache.dyy, dyy, bytes) &&
            copy_to_device_preferring_device(cache.gyz, gyz, bytes) &&
            copy_to_device_preferring_device(cache.dzz, dzz, bytes) &&
            copy_to_device_preferring_device(cache.Axx, Axx, bytes) &&
            copy_to_device_preferring_device(cache.Axy, Axy, bytes) &&
            copy_to_device_preferring_device(cache.Axz, Axz, bytes) &&
            copy_to_device_preferring_device(cache.Ayy, Ayy, bytes) &&
            copy_to_device_preferring_device(cache.Ayz, Ayz, bytes) &&
            copy_to_device_preferring_device(cache.Azz, Azz, bytes);

  if (ok)
  {
    double *d_dxx = cache.dxx.ptr, *d_gxy = cache.gxy.ptr, *d_gxz = cache.gxz.ptr;
    double *d_dyy = cache.dyy.ptr, *d_gyz = cache.gyz.ptr, *d_dzz = cache.dzz.ptr;
    double *d_Axx = cache.Axx.ptr, *d_Axy = cache.Axy.ptr, *d_Axz = cache.Axz.ptr;
    double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr;
    void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
                    &d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
    ok = launch_kernel(grid, block, (const void *)enforce_ga_kernel, args);
  }

  if (ok)
  {
    // The next GPU RHS stage consumes these fields immediately.
    // Keep them device-resident and expose the mapping so gpu_rhs() can
    // reuse them via D2D copies instead of forcing an intermediate D2H round-trip.
    bssn_gpu_register_device_buffer(dxx, cache.dxx.ptr);
    bssn_gpu_register_device_buffer(gxy, cache.gxy.ptr);
    bssn_gpu_register_device_buffer(gxz, cache.gxz.ptr);
    bssn_gpu_register_device_buffer(dyy, cache.dyy.ptr);
    bssn_gpu_register_device_buffer(gyz, cache.gyz.ptr);
    bssn_gpu_register_device_buffer(dzz, cache.dzz.ptr);
    bssn_gpu_register_device_buffer(Axx, cache.Axx.ptr);
    bssn_gpu_register_device_buffer(Axy, cache.Axy.ptr);
    bssn_gpu_register_device_buffer(Axz, cache.Axz.ptr);
    bssn_gpu_register_device_buffer(Ayy, cache.Ayy.ptr);
    bssn_gpu_register_device_buffer(Ayz, cache.Ayz.ptr);
    bssn_gpu_register_device_buffer(Azz, cache.Azz.ptr);
  }

  return ok ? 0 : 1;
}

int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               const double *X, const double *Y, const double *Z,
                               double xmin, double ymin, double zmin,
                               double xmax, double ymax, double zmax,
                               const double *state0,
                               const double *boundary_src,
                               double *stage_data,
                               double *rhs_accum,
                               double propspeed,
                               const double SoA[3],
                               int symmetry,
                               int lev,
                               int rk_stage,
                               bool download_to_host)
{
  struct Rk4VarCache
  {
    CachedBuffer X, Y, Z;
    CachedBuffer state0, boundary, stage, rhs;
    const double *host_X = nullptr;
    const double *host_Y = nullptr;
    const double *host_Z = nullptr;
    const double *host_state0 = nullptr;
    double *host_rhs = nullptr;
    int nx = 0;
    int ny = 0;
    int nz = 0;
    bool rhs_resident = false;
  };
  static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map;
  Rk4VarCache &cache = cache_map[state0];

  int nx = ex[0];
  int ny = ex[1];
  int nz = ex[2];
  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  const size_t bytes_x = static_cast<size_t>(nx) * sizeof(double);
  const size_t bytes_y = static_cast<size_t>(ny) * sizeof(double);
  const size_t bytes_z = static_cast<size_t>(nz) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

  const bool need_bam_boundary = (lev == 0);
  const bool need_coord_copy = need_bam_boundary;
  const bool need_boundary_input = need_bam_boundary && (rk_stage != 0);
  const bool need_stage_input = (rk_stage != 0);
  bool ok = true;
  if (need_coord_copy &&
      (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
       cache.nx != nx || cache.ny != ny || cache.nz != nz))
  {
    ok = copy_to_device(cache.X, X, bytes_x) &&
         copy_to_device(cache.Y, Y, bytes_y) &&
         copy_to_device(cache.Z, Z, bytes_z);
    if (ok)
    {
      cache.host_X = X;
      cache.host_Y = Y;
      cache.host_Z = Z;
      cache.nx = nx;
      cache.ny = ny;
      cache.nz = nz;
    }
  }

  const bool refresh_state0 =
      (rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz;
  const bool refresh_rhs =
      (rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
  double *stage_ptr = nullptr;
  const double *mapped_state0_ptr = refresh_state0 ? bssn_gpu_find_device_buffer(state0) : cache.state0.ptr;
  const double *mapped_boundary_ptr = need_boundary_input ? bssn_gpu_find_device_buffer(boundary_src) : nullptr;
  const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;
  const double *mapped_rhs_ptr = refresh_rhs ? bssn_gpu_find_device_buffer(rhs_accum) : cache.rhs.ptr;

  if (refresh_state0 && !mapped_state0_ptr)
    bssn_gpu_prepare_host_buffer(state0, n);
  if (need_boundary_input && !mapped_boundary_ptr)
    bssn_gpu_prepare_host_buffer(boundary_src, n);
  if (need_stage_input && !mapped_stage_ptr)
    bssn_gpu_prepare_host_buffer(stage_data, n);
  if (refresh_rhs && !mapped_rhs_ptr)
    bssn_gpu_prepare_host_buffer(rhs_accum, n);

  ok = ok &&
       (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
       (!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) &&
       (!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));

  if (ok && need_stage_input)
  {
    if (mapped_stage_ptr)
    {
      stage_ptr = const_cast<double *>(mapped_stage_ptr);
    }
    else
    {
      ok = copy_to_device_preferring_device(cache.stage, stage_data, bytes);
      stage_ptr = cache.stage.ptr;
    }
  }
  else if (ok)
  {
    ok = ensure_capacity(cache.stage, bytes);
    stage_ptr = cache.stage.ptr;
  }

  if (!ok)
    return 1;

  if (refresh_state0)
  {
    cache.host_state0 = state0;
    bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
  }
  if (refresh_rhs)
  {
    cache.host_rhs = rhs_accum;
    cache.rhs_resident = true;
    bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
  }

  double dX = X[1] - X[0];
  double dY = Y[1] - Y[0];
  double dZ = Z[1] - Z[0];
  const int no_symm = 0, eq_symm = 1, octant = 2;
  int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
  int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
  int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
  int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
  int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
  int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);
  double soa0 = SoA[0];
  double soa1 = SoA[1];
  double soa2 = SoA[2];

  if (need_bam_boundary)
  {
    int imin = 1;
    int jmin = 1;
    int kmin = 1;
    if (symmetry > no_symm && std::fabs(Z[0]) < dZ) kmin = 0;
    if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
    if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;

    double *d_X = cache.X.ptr, *d_Y = cache.Y.ptr, *d_Z = cache.Z.ptr;
    double *d_state0 = cache.state0.ptr, *d_boundary = cache.boundary.ptr;
    double *d_rhs = cache.rhs.ptr;
    double *bam_target = (rk_stage == 0) ? d_rhs : stage_ptr;
    const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
    void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
                    &xmin, &ymin, &zmin, &xmax, &ymax, &zmax,
                    &has_xmin, &has_ymin, &has_zmin,
                    &has_xmax, &has_ymax, &has_zmax,
                    &imin, &jmin, &kmin, &propspeed,
                    &bam_source, &bam_target,
                    &soa0, &soa1, &soa2,
                    &symmetry};
    ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
  }

  if (ok)
  {
    double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
    void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
    ok = launch_kernel(grid, block, (const void *)rk4_kernel, args);
  }

  if (ok && lev > 0)
  {
      double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
      void *args[] = {&nx, &ny, &nz,
                      &has_xmin, &has_ymin, &has_zmin,
                      &has_xmax, &has_ymax, &has_zmax,
                      &d_state0, &d_stage};
    ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
  }

  if (ok)
  {
    bssn_gpu_register_device_buffer(stage_data, stage_ptr);
    if (download_to_host)
    {
      cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
      if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
      ok = err == cudaSuccess;
    }
  }

  return ok ? 0 : 1;
}

int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host)
{
  static thread_local CachedBuffer d_chi;

  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

  double *device_chi = nullptr;
  const double *mapped = bssn_gpu_find_device_buffer(chi);
  bool ok = true;
  if (mapped)
  {
    device_chi = const_cast<double *>(mapped);
  }
  else
  {
    ok = copy_to_device_preferring_device(d_chi, chi, bytes);
    device_chi = d_chi.ptr;
  }

  if (ok)
  {
    double *ptr = device_chi;
    void *args[] = {&n, &ptr, &tinny};
    ok = launch_kernel(grid, block, (const void *)lowerbound_kernel, args);
  }

  if (ok)
  {
    bssn_gpu_register_device_buffer(chi, device_chi);
    if (download_to_host)
    {
      bssn_gpu_prepare_host_buffer(chi, n);
      cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
      if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
      ok = err == cudaSuccess;
    }
  }
  return ok ? 0 : 1;
}

int bssn_cuda_download_buffer(int *ex, double *host_ptr)
{
  const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
  if (!device_ptr)
    return 1;

  const int n = count_points(ex);
  bssn_gpu_prepare_host_buffer(host_ptr, n);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) buffered download", err);
    return 1;
  }
  return 0;
}

int bssn_cuda_interp_points_batch(const int *ex,
                                  const double *X, const double *Y, const double *Z,
                                  const double *const *fields,
                                  const double *soa_flat,
                                  int num_var,
                                  const double *px, const double *py, const double *pz,
                                  int num_points,
                                  int ordn,
                                  int symmetry,
                                  double *out)
{
  if (!ex || !X || !Y || !Z || !fields || !soa_flat || !px || !py || !pz || !out)
    return 1;
  if (num_var <= 0 || num_points <= 0 || ordn != 6)
    return 1;
  if (ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
    return 1;

  struct InterpBatchCache
  {
    struct StencilCacheEntry
    {
      const double *X;
      const double *Y;
      const double *Z;
      const double *px;
      const double *py;
      const double *pz;
      int nx;
      int ny;
      int nz;
      int num_points;
      int ordn;
      int symmetry;
      CachedBuffer weights;
      CachedIntBuffer indices;
      CachedIntBuffer reflect;

      StencilCacheEntry()
          : X(nullptr), Y(nullptr), Z(nullptr),
            px(nullptr), py(nullptr), pz(nullptr),
            nx(0), ny(0), nz(0), num_points(0), ordn(0), symmetry(0) {}
    };

    CachedBuffer out;
    CachedBuffer soa;
    CachedBuffer field_ptrs;
    CachedIntBuffer error_flag;
    std::vector<CachedBuffer> host_field_copies;
    std::vector<StencilCacheEntry> stencil_entries;
  };
  static thread_local InterpBatchCache cache;

  const int nx = ex[0];
  const int ny = ex[1];
  const int nz = ex[2];
  const int field_points = count_points(ex);
  const size_t field_bytes = static_cast<size_t>(field_points) * sizeof(double);
  const size_t out_bytes = static_cast<size_t>(num_points) * static_cast<size_t>(num_var) * sizeof(double);
  const size_t soa_bytes = static_cast<size_t>(3 * num_var) * sizeof(double);
  const size_t ptr_bytes = static_cast<size_t>(num_var) * sizeof(double *);
  const size_t point_stencil_doubles = static_cast<size_t>(num_points) * 18;
  const size_t point_stencil_ints = static_cast<size_t>(num_points) * 18;
  const size_t weights_bytes = point_stencil_doubles * sizeof(double);
  const size_t indices_bytes = point_stencil_ints * sizeof(int);

  bool ok = true;
  InterpBatchCache::StencilCacheEntry *stencil_cache = nullptr;
  for (size_t i = 0; i < cache.stencil_entries.size(); ++i)
  {
    InterpBatchCache::StencilCacheEntry &entry = cache.stencil_entries[i];
    if (entry.X == X && entry.Y == Y && entry.Z == Z &&
        entry.px == px && entry.py == py && entry.pz == pz &&
        entry.nx == nx && entry.ny == ny && entry.nz == nz &&
        entry.num_points == num_points && entry.ordn == ordn &&
        entry.symmetry == symmetry)
    {
      stencil_cache = &entry;
      break;
    }
  }

  if (!stencil_cache)
  {
    cache.stencil_entries.push_back(InterpBatchCache::StencilCacheEntry());
    stencil_cache = &cache.stencil_entries.back();
    stencil_cache->X = X;
    stencil_cache->Y = Y;
    stencil_cache->Z = Z;
    stencil_cache->px = px;
    stencil_cache->py = py;
    stencil_cache->pz = pz;
    stencil_cache->nx = nx;
    stencil_cache->ny = ny;
    stencil_cache->nz = nz;
    stencil_cache->num_points = num_points;
    stencil_cache->ordn = ordn;
    stencil_cache->symmetry = symmetry;

    std::vector<double> host_weights(point_stencil_doubles);
    std::vector<int> host_indices(point_stencil_ints);
    std::vector<int> host_reflect(point_stencil_ints);
    const double dx = X[1] - X[0];
    const double dy = Y[1] - Y[0];
    const double dz = Z[1] - Z[0];
    const int allow_reflect_x = (symmetry == 2 && std::fabs(X[0]) < dx);
    const int allow_reflect_y = (symmetry == 2 && std::fabs(Y[0]) < dy);
    const int allow_reflect_z = (symmetry != 0 && std::fabs(Z[0]) < dz);
    for (int p = 0; p < num_points; ++p)
    {
      int start_x = 0, start_y = 0, start_z = 0;
      double cx = 0.0, cy = 0.0, cz = 0.0;
      const bool ok_x = compute_interp_window_host(px[p], X, nx, ordn, allow_reflect_x, &start_x, &cx);
      const bool ok_y = compute_interp_window_host(py[p], Y, ny, ordn, allow_reflect_y, &start_y, &cy);
      const bool ok_z = compute_interp_window_host(pz[p], Z, nz, ordn, allow_reflect_z, &start_z, &cz);
      if (!ok_x || !ok_y || !ok_z)
        return 1;

      lagrange_weights_ord6_host(cx, host_weights.data() + p * 18);
      lagrange_weights_ord6_host(cy, host_weights.data() + p * 18 + 6);
      lagrange_weights_ord6_host(cz, host_weights.data() + p * 18 + 12);

      for (int i = 0; i < 6; ++i)
      {
        if (!map_interp_index_host(start_x + i, nx, &host_indices[p * 18 + i], &host_reflect[p * 18 + i]))
          return 1;
        if (!map_interp_index_host(start_y + i, ny, &host_indices[p * 18 + 6 + i], &host_reflect[p * 18 + 6 + i]))
          return 1;
        if (!map_interp_index_host(start_z + i, nz, &host_indices[p * 18 + 12 + i], &host_reflect[p * 18 + 12 + i]))
          return 1;
      }
    }

    ok = ok &&
         copy_to_device(stencil_cache->weights, host_weights.data(), weights_bytes) &&
         copy_to_device(stencil_cache->indices, host_indices.data(), indices_bytes) &&
         copy_to_device(stencil_cache->reflect, host_reflect.data(), indices_bytes);
    if (!ok)
      return 1;
  }

  ok = ok &&
       copy_to_device(cache.soa, soa_flat, soa_bytes) &&
       ensure_capacity(cache.out, out_bytes) &&
       ensure_capacity(cache.field_ptrs, ptr_bytes) &&
       ensure_capacity(cache.error_flag, sizeof(int));
  if (!ok)
    return 1;

  if (static_cast<int>(cache.host_field_copies.size()) < num_var)
    cache.host_field_copies.resize(num_var);

  std::vector<const double *> device_fields(num_var);
  for (int v = 0; v < num_var; ++v)
  {
    const double *device_field = bssn_gpu_find_device_buffer(fields[v]);
    if (!device_field)
    {
      ok = copy_to_device(cache.host_field_copies[v], fields[v], field_bytes);
      device_field = cache.host_field_copies[v].ptr;
    }
    device_fields[v] = device_field;
    if (!ok || !device_fields[v])
      return 1;
  }

  int zero = 0;
  cudaError_t err = cudaMemcpy(cache.field_ptrs.ptr, device_fields.data(), ptr_bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D) field_ptrs", err);
    return 1;
  }
  err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D) interp_error_flag", err);
    return 1;
  }

  dim3 block(128);
  dim3 grid(div_up(num_points * num_var, static_cast<int>(block.x)));
  if (grid.x > 4096)
    grid.x = 4096;

  int nx_local = nx;
  int ny_local = ny;
  const double *dsoa = cache.soa.ptr;
  const double *const *dfields = reinterpret_cast<const double *const *>(cache.field_ptrs.ptr);
  const double *dweights = stencil_cache->weights.ptr;
  const int *dindices = stencil_cache->indices.ptr;
  const int *dreflect = stencil_cache->reflect.ptr;
  double *dout = cache.out.ptr;
  int *derror = cache.error_flag.ptr;

  void *args[] = {
      &num_points, &num_var,
      &nx_local, &ny_local,
      &dfields,
      &dsoa,
      &dindices,
      &dreflect,
      &dweights,
      &dout,
      &derror};
  ok = launch_kernel(grid, block, (const void *)interp_points_ord6_kernel, args);
  if (!ok)
    return 1;

  int error_flag = 0;
  err = cudaMemcpy(&error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) interp_error_flag", err);
    return 1;
  }
  if (error_flag != 0)
    return 1;

  err = cudaMemcpy(out, cache.out.ptr, out_bytes, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) interp_out", err);
    return 1;
  }

  return 0;
}

int bssn_cuda_prolong3_pack(int wei,
                            const double *llbc, const double *uubc, const int *extc, const double *func,
                            const double *llbf, const double *uubf, const int *extf, double *funf,
                            const double *llbp, const double *uubp,
                            const double *SoA, int symmetry)
{
  if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
    return 1;

  // The symmetry-aware prolong CUDA path is still not equivalent to the
  // active Cell/ghost_width=3 Fortran implementation, so keep the safe
  // fallback for all symmetry-enabled cases.
  if (symmetry != 0)
    return 1;

  auto idint_like = [](double x) -> int {
    return static_cast<int>(std::trunc(x));
  };

  double base[3];
  double CD[3];
  double FD[3];
  int lbc[3], ubc[3], lbf[3], ubf[3], lbp[3], ubp[3], lbpc[3], ubpc[3];
  for (int d = 0; d < 3; ++d)
  {
    CD[d] = (uubc[d] - llbc[d]) / static_cast<double>(extc[d]);
    FD[d] = (uubf[d] - llbf[d]) / static_cast<double>(extf[d]);
    if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10)
      return 1;

    if (llbc[d] <= llbf[d])
    {
      base[d] = llbc[d];
    }
    else
    {
      int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4);
      base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0);
    }

    lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1;
    ubf[d] = idint_like((uubf[d] - base[d]) / FD[d] + 0.4);
    lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1;
    ubc[d] = idint_like((uubc[d] - base[d]) / CD[d] + 0.4);
    lbp[d] = idint_like((llbp[d] - base[d]) / FD[d] + 0.4) + 1;
    lbpc[d] = idint_like((llbp[d] - base[d]) / CD[d] + 0.4) + 1;
    ubp[d] = idint_like((uubp[d] - base[d]) / FD[d] + 0.4);
    ubpc[d] = idint_like((uubp[d] - base[d]) / CD[d] + 0.4);
    (void)ubc[d];
    (void)ubf[d];
  }

  const int imino = lbp[0] - lbf[0] + 1;
  const int imaxo = ubp[0] - lbf[0] + 1;
  const int jmino = lbp[1] - lbf[1] + 1;
  const int jmaxo = ubp[1] - lbf[1] + 1;
  const int kmino = lbp[2] - lbf[2] + 1;
  const int kmaxo = ubp[2] - lbf[2] + 1;

  const int imini = lbpc[0] - lbc[0] + 1;
  const int imaxi = ubpc[0] - lbc[0] + 1;
  const int jmini = lbpc[1] - lbc[1] + 1;
  const int jmaxi = ubpc[1] - lbc[1] + 1;
  const int kmini = lbpc[2] - lbc[2] + 1;
  const int kmaxi = ubpc[2] - lbc[2] + 1;

  if (imino < 1 || jmino < 1 || kmino < 1 ||
      imini < 1 || jmini < 1 || kmini < 1 ||
      imaxo > extf[0] || jmaxo > extf[1] || kmaxo > extf[2] ||
      imaxi > extc[0] - 2 || jmaxi > extc[1] - 2 || kmaxi > extc[2] - 2)
  {
    return 1;
  }

  auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int {
    int ii = fine_idx + lbf_d - 1;
    return ii / 2 - lbc_d + 1;
  };
  const int ic_min = coarse_center_index(imino, lbf[0], lbc[0]);
  const int jc_min = coarse_center_index(jmino, lbf[1], lbc[1]);
  const int kc_min = coarse_center_index(kmino, lbf[2], lbc[2]);

  // Current CUDA prolong path only supports the same fast path as the
  // optimized Fortran code: interior stencil access without symmetry_bd().
  if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1)
    return 1;

  struct ProlongCache
  {
    CachedBuffer coarse;
    CachedBuffer fine;
  };
  static thread_local ProlongCache cache;

  int nxc = extc[0], nyc = extc[1], nzc = extc[2];
  int nxf = extf[0], nyf = extf[1], nzf = extf[2];
  int sxc = nxc + 3;
  int syc = nyc + 3;
  int szc = nzc + 3;
  int coarse_points = sxc * syc * szc;
  int fine_points = nxf * nyf * nzf;
  const size_t coarse_bytes = static_cast<size_t>(coarse_points) * sizeof(double);
  const size_t fine_bytes = static_cast<size_t>(fine_points) * sizeof(double);

  std::vector<double> funcc_host(static_cast<size_t>(coarse_points), 0.0);
  auto coarse_index = [sxc, syc](int fi, int fj, int fk) -> size_t {
    return static_cast<size_t>(fi + 2) +
           static_cast<size_t>(fj + 2) * static_cast<size_t>(sxc) +
           static_cast<size_t>(fk + 2) * static_cast<size_t>(sxc) * static_cast<size_t>(syc);
  };
  auto func_index = [nxc, nyc](int i, int j, int k) -> size_t {
    return static_cast<size_t>(i - 1) +
           static_cast<size_t>(j - 1) * static_cast<size_t>(nxc) +
           static_cast<size_t>(k - 1) * static_cast<size_t>(nxc) * static_cast<size_t>(nyc);
  };

  for (int k = 1; k <= nzc; ++k)
  {
    for (int j = 1; j <= nyc; ++j)
    {
      for (int i = 1; i <= nxc; ++i)
      {
        funcc_host[coarse_index(i, j, k)] = func[func_index(i, j, k)];
      }
    }
  }

  for (int offset = 0; offset < 3; ++offset)
  {
    int target_i = -offset;
    int source_i = offset + 2;
    for (int k = 1; k <= nzc; ++k)
    {
      for (int j = 1; j <= nyc; ++j)
      {
        funcc_host[coarse_index(target_i, j, k)] = funcc_host[coarse_index(source_i, j, k)] * SoA[0];
      }
    }
  }

  for (int offset = 0; offset < 3; ++offset)
  {
    int target_j = -offset;
    int source_j = offset + 2;
    for (int k = 1; k <= nzc; ++k)
    {
      for (int i = -2; i <= nxc; ++i)
      {
        funcc_host[coarse_index(i, target_j, k)] = funcc_host[coarse_index(i, source_j, k)] * SoA[1];
      }
    }
  }

  for (int offset = 0; offset < 3; ++offset)
  {
    int target_k = -offset;
    int source_k = offset + 2;
    for (int j = -2; j <= nyc; ++j)
    {
      for (int i = -2; i <= nxc; ++i)
      {
        funcc_host[coarse_index(i, j, target_k)] = funcc_host[coarse_index(i, j, source_k)] * SoA[2];
      }
    }
  }

  double *d_func = nullptr;
  if (!copy_to_device(cache.coarse, funcc_host.data(), coarse_bytes))
    return 1;
  d_func = cache.coarse.ptr;

  if (!ensure_capacity(cache.fine, fine_bytes))
  {
    return 1;
  }

  dim3 block(256);
  dim3 grid(div_up(fine_points, static_cast<int>(block.x)));
  double *d_funf = cache.fine.ptr;
  int ibegin = imino - 1, iend = imaxo - 1;
  int jbegin = jmino - 1, jend = jmaxo - 1;
  int kbegin = kmino - 1, kend = kmaxo - 1;
  void *args[] = {&d_func, &d_funf,
                  &sxc, &syc,
                  &nxf, &nyf, &nzf,
                  &lbc[0], &lbc[1], &lbc[2],
                  &lbf[0], &lbf[1], &lbf[2],
                  &ibegin, &iend,
                  &jbegin, &jend,
                  &kbegin, &kend};

  if (!launch_kernel(grid, block, (const void *)prolong3_cell_kernel, args))
    return 1;

  cudaError_t sync_err = cudaDeviceSynchronize();
  if (sync_err != cudaSuccess)
  {
    report_cuda_error("cudaDeviceSynchronize prolong3", sync_err);
    return 1;
  }

  cudaError_t err = cudaMemcpy(funf, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) prolong3", err);
    return 1;
  }
  return 0;
}