AMSS-NCKU/AMSS_NCKU_source/bssn_cuda_ops.cu

#include "bssn_cuda_ops.h"
#include "bssn_gpu.h"

#include <cmath>
#include <cstdio>
#include <cuda_runtime.h>
#include <unordered_map>
#include <vector>

namespace {

inline void report_cuda_error(const char *where, cudaError_t err)
{
  if (err != cudaSuccess)
    std::fprintf(stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString(err));
}

inline int count_points(const int ex[3])
{
  return ex[0] * ex[1] * ex[2];
}

inline int div_up(int a, int b)
{
  return (a + b - 1) / b;
}

struct DeviceArrays
{
  double *x = nullptr;
  double *y = nullptr;
  double *z = nullptr;
  double *a = nullptr;
  double *b = nullptr;
  double *c = nullptr;
  double *d = nullptr;
};

struct CachedBuffer
{
  double *ptr = nullptr;
  size_t capacity = 0;
};

inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
{
  if (bytes <= buffer.capacity && buffer.ptr)
    return true;

  if (buffer.ptr)
  {
    cudaError_t free_err = cudaFree(buffer.ptr);
    if (free_err != cudaSuccess)
      report_cuda_error("cudaFree", free_err);
    buffer.ptr = nullptr;
    buffer.capacity = 0;
  }

  cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMalloc", err);
    return false;
  }

  buffer.capacity = bytes;
  return true;
}

inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
{
  if (!ensure_capacity(dst, bytes))
    return false;

  cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D)", err);
    return false;
  }
  return true;
}

inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *src, size_t bytes)
{
  if (!ensure_capacity(dst, bytes))
    return false;

  const double *device_src = bssn_gpu_find_device_buffer(src);
  cudaMemcpyKind kind = cudaMemcpyHostToDevice;
  const void *copy_src = src;
  if (device_src)
  {
    copy_src = device_src;
    kind = cudaMemcpyDeviceToDevice;
  }

  cudaError_t err = cudaMemcpy(dst.ptr, copy_src, bytes, kind);
  if (err != cudaSuccess)
  {
    report_cuda_error(kind == cudaMemcpyDeviceToDevice ? "cudaMemcpy(D2D)" : "cudaMemcpy(H2D)", err);
    return false;
  }
  return true;
}

__global__ void enforce_ga_kernel(int n,
                                  double *dxx, double *gxy, double *gxz,
                                  double *dyy, double *gyz, double *dzz,
                                  double *Axx, double *Axy, double *Axz,
                                  double *Ayy, double *Ayz, double *Azz)
{
  const double one = 1.0;
  const double two = 2.0;
  const double one_third = 1.0 / 3.0;

  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    double lgxx = dxx[idx] + one;
    double lgyy = dyy[idx] + one;
    double lgzz = dzz[idx] + one;
    double lgxy = gxy[idx];
    double lgxz = gxz[idx];
    double lgyz = gyz[idx];

    double det = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz
               - lgxz * lgyy * lgxz - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz;
    double scale = 1.0 / cbrt(det);

    lgxx *= scale;
    lgxy *= scale;
    lgxz *= scale;
    lgyy *= scale;
    lgyz *= scale;
    lgzz *= scale;

    dxx[idx] = lgxx - one;
    gxy[idx] = lgxy;
    gxz[idx] = lgxz;
    dyy[idx] = lgyy - one;
    gyz[idx] = lgyz;
    dzz[idx] = lgzz - one;

    double gupxx = (lgyy * lgzz - lgyz * lgyz);
    double gupxy = -(lgxy * lgzz - lgyz * lgxz);
    double gupxz = (lgxy * lgyz - lgyy * lgxz);
    double gupyy = (lgxx * lgzz - lgxz * lgxz);
    double gupyz = -(lgxx * lgyz - lgxy * lgxz);
    double gupzz = (lgxx * lgyy - lgxy * lgxy);

    double trA = gupxx * Axx[idx] + gupyy * Ayy[idx] + gupzz * Azz[idx]
               + two * (gupxy * Axy[idx] + gupxz * Axz[idx] + gupyz * Ayz[idx]);

    Axx[idx] -= one_third * lgxx * trA;
    Axy[idx] -= one_third * lgxy * trA;
    Axz[idx] -= one_third * lgxz * trA;
    Ayy[idx] -= one_third * lgyy * trA;
    Ayz[idx] -= one_third * lgyz * trA;
    Azz[idx] -= one_third * lgzz * trA;
  }
}

__device__ inline int index3(int i, int j, int k, int nx, int ny)
{
  return i + j * nx + k * nx * ny;
}

__device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k,
                                            int nx, int ny, int nz,
                                            const double soa[3], int symmetry)
{
  double sign = 1.0;
  if (i < 0)
  {
    i = -i - 1;
    sign *= soa[0];
  }
  if (j < 0)
  {
    j = -j - 1;
    sign *= soa[1];
  }
  if (k < 0)
  {
    k = -k - 1;
    sign *= soa[2];
  }
  if (i >= nx) i = nx - 1;
  if (j >= ny) j = ny - 1;
  if (k >= nz) k = nz - 1;
  (void)symmetry;
  return sign * f[index3(i, j, k, nx, ny)];
}

__device__ inline double load_prolong_cell_padded(const double *f,
                                                  int i, int j, int k,
                                                  int sx, int sy)
{
  return f[(i + 2) + (j + 2) * sx + (k + 2) * sx * sy];
}

__global__ void prolong3_cell_kernel(const double *funcc, double *funf,
                                     int sxc, int syc,
                                     int nxf, int nyf, int nzf,
                                     int lbc0, int lbc1, int lbc2,
                                     int lbf0, int lbf1, int lbf2,
                                     int ibegin, int iend,
                                     int jbegin, int jend,
                                     int kbegin, int kend)
{
  const double C1 = 7.7e1 / 8.192e3;
  const double C2 = -6.93e2 / 8.192e3;
  const double C3 = 3.465e3 / 4.096e3;
  const double C4 = 1.155e3 / 4.096e3;
  const double C5 = -4.95e2 / 8.192e3;
  const double C6 = 6.3e1 / 8.192e3;
  const double w_even[6] = {C1, C2, C3, C4, C5, C6};
  const double w_odd[6] = {C6, C5, C4, C3, C2, C1};

  const int n = nxf * nyf * nzf;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nxf * nyf;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nxf;
    const int i = rem - j * nxf;

    if (i < ibegin || i > iend || j < jbegin || j > jend || k < kbegin || k > kend)
      continue;

    const int ii = i + lbf0;
    const int jj = j + lbf1;
    const int kk = k + lbf2;

    const int ic0 = ii / 2 - lbc0 - 1;
    const int jc0 = jj / 2 - lbc1 - 1;
    const int kc0 = kk / 2 - lbc2 - 1;

    const double *wx = (ii & 1) ? w_odd : w_even;
    const double *wy = (jj & 1) ? w_odd : w_even;
    const double *wz = (kk & 1) ? w_odd : w_even;

    double value = 0.0;
    for (int ox = 0; ox < 6; ++ox)
    {
      const int cx = ic0 + ox;
      double yz = 0.0;
      for (int oy = 0; oy < 6; ++oy)
      {
        const int cy = jc0 + oy;
        double zsum = 0.0;
        for (int oz = 0; oz < 6; ++oz)
        {
          const int cz = kc0 + oz;
          zsum += wz[oz] * load_prolong_cell_padded(funcc, cx, cy, cz, sxc, syc);
        }
        yz += wy[oy] * zsum;
      }
      value += wx[ox] * yz;
    }

    funf[idx] = value;
  }
}

__global__ void rk4_kernel(int n, double dT,
                           const double *f0,
                           double *f1,
                           double *rhs,
                           int stage)
{
  const double half = 0.5;
  const double one_sixth = 1.0 / 6.0;

  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    if (stage == 0)
    {
      f1[idx] = f0[idx] + half * dT * rhs[idx];
    }
    else if (stage == 1)
    {
      rhs[idx] += 2.0 * f1[idx];
      f1[idx] = f0[idx] + half * dT * f1[idx];
    }
    else if (stage == 2)
    {
      rhs[idx] += 2.0 * f1[idx];
      f1[idx] = f0[idx] + dT * f1[idx];
    }
    else
    {
      f1[idx] = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
    }
  }
}

__global__ void lowerbound_kernel(int n, double *chi, double tinny)
{
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    if (chi[idx] < tinny)
      chi[idx] = tinny;
  }
}

__global__ void copy_physical_boundary_kernel(int nx, int ny, int nz,
                                              int has_xmin, int has_ymin, int has_zmin,
                                              int has_xmax, int has_ymax, int has_zmax,
                                              const double *src, double *dst)
{
  const int n = nx * ny * nz;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nx * ny;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nx;
    const int i = rem - j * nx;

    if ((has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
        (has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
        (has_zmin && k == 0) || (has_zmax && k == nz - 1))
    {
      dst[idx] = src[idx];
    }
  }
}

__global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
                                      const double *X, const double *Y, const double *Z,
                                      double xmin, double ymin, double zmin,
                                      double xmax, double ymax, double zmax,
                                      int has_xmin, int has_ymin, int has_zmin,
                                      int has_xmax, int has_ymax, int has_zmax,
                                      int imin, int jmin, int kmin,
                                      double propspeed,
                                      const double *f0,
                                      double *target,
                                      double soa0, double soa1, double soa2,
                                      int symmetry)
{
  const double one = 1.0;
  const double two = 2.0;
  const int n = nx * ny * nz;
  const double dX = X[1] - X[0];
  const double dY = Y[1] - Y[0];
  const double dZ = Z[1] - Z[0];
  const double d2dx = one / (two * dX);
  const double d2dy = one / (two * dY);
  const double d2dz = one / (two * dZ);
  const double soa[3] = {soa0, soa1, soa2};

  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nx * ny;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nx;
    const int i = rem - j * nx;

    const bool on_boundary =
        (has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
        (has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
        (has_zmin && k == 0) || (has_zmax && k == nz - 1);

    if (!on_boundary)
      continue;

    const double radius = sqrt(X[i] * X[i] + Y[j] * Y[j] + Z[k] * Z[k]);
    if (radius == 0.0)
      continue;

    const double wx = propspeed * X[i] / radius;
    const double wy = propspeed * Y[j] / radius;
    const double wz = propspeed * Z[k] / radius;

    double fx = 0.0;
    double fy = 0.0;
    double fz = 0.0;

    if (wx > 0.0)
    {
      if (i - 2 >= imin)
        fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
      else if (i - 1 >= imin)
        fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
      else
        fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
    }
    else if (wx < 0.0)
    {
      if (i + 2 <= nx - 1)
        fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
      else if (i + 1 <= nx - 1)
        fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
      else
        fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
    }

    if (wy > 0.0)
    {
      if (j - 2 >= jmin)
        fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
      else if (j - 1 >= jmin)
        fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
      else
        fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
    }
    else if (wy < 0.0)
    {
      if (j + 2 <= ny - 1)
        fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
      else if (j + 1 <= ny - 1)
        fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
      else
        fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
    }

    if (wz > 0.0)
    {
      if (k - 2 >= kmin)
        fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
      else if (k - 1 >= kmin)
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
      else
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
    }
    else if (wz < 0.0)
    {
      if (k + 2 <= nz - 1)
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
      else if (k + 1 <= nz - 1)
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
      else
        fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
    }

    target[idx] = -propspeed * (fx * X[i] + fy * Y[j] + fz * Z[k] + f0[idx]) / radius;
  }
}

inline bool launch_kernel(dim3 grid, dim3 block, const void *kernel, void **args)
{
  cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaLaunchKernel", err);
    return false;
  }
  err = cudaPeekAtLastError();
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaPeekAtLastError", err);
    return false;
  }
  return true;
}

} // namespace

int bssn_cuda_enforce_ga(int *ex,
                         double *dxx, double *gxy, double *gxz,
                         double *dyy, double *gyz, double *dzz,
                         double *Axx, double *Axy, double *Axz,
                         double *Ayy, double *Ayz, double *Azz)
{
  struct EnforceGaCache
  {
    CachedBuffer dxx, gxy, gxz, dyy, gyz, dzz;
    CachedBuffer Axx, Axy, Axz, Ayy, Ayz, Azz;
  };
  static thread_local EnforceGaCache cache;

  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

  bool ok = copy_to_device_preferring_device(cache.dxx, dxx, bytes) &&
            copy_to_device_preferring_device(cache.gxy, gxy, bytes) &&
            copy_to_device_preferring_device(cache.gxz, gxz, bytes) &&
            copy_to_device_preferring_device(cache.dyy, dyy, bytes) &&
            copy_to_device_preferring_device(cache.gyz, gyz, bytes) &&
            copy_to_device_preferring_device(cache.dzz, dzz, bytes) &&
            copy_to_device_preferring_device(cache.Axx, Axx, bytes) &&
            copy_to_device_preferring_device(cache.Axy, Axy, bytes) &&
            copy_to_device_preferring_device(cache.Axz, Axz, bytes) &&
            copy_to_device_preferring_device(cache.Ayy, Ayy, bytes) &&
            copy_to_device_preferring_device(cache.Ayz, Ayz, bytes) &&
            copy_to_device_preferring_device(cache.Azz, Azz, bytes);

  if (ok)
  {
    double *d_dxx = cache.dxx.ptr, *d_gxy = cache.gxy.ptr, *d_gxz = cache.gxz.ptr;
    double *d_dyy = cache.dyy.ptr, *d_gyz = cache.gyz.ptr, *d_dzz = cache.dzz.ptr;
    double *d_Axx = cache.Axx.ptr, *d_Axy = cache.Axy.ptr, *d_Axz = cache.Axz.ptr;
    double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr;
    void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
                    &d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
    ok = launch_kernel(grid, block, (const void *)enforce_ga_kernel, args);
  }

  if (ok)
  {
    // The next GPU RHS stage consumes these fields immediately.
    // Keep them device-resident and expose the mapping so gpu_rhs() can
    // reuse them via D2D copies instead of forcing an intermediate D2H round-trip.
    bssn_gpu_register_device_buffer(dxx, cache.dxx.ptr);
    bssn_gpu_register_device_buffer(gxy, cache.gxy.ptr);
    bssn_gpu_register_device_buffer(gxz, cache.gxz.ptr);
    bssn_gpu_register_device_buffer(dyy, cache.dyy.ptr);
    bssn_gpu_register_device_buffer(gyz, cache.gyz.ptr);
    bssn_gpu_register_device_buffer(dzz, cache.dzz.ptr);
    bssn_gpu_register_device_buffer(Axx, cache.Axx.ptr);
    bssn_gpu_register_device_buffer(Axy, cache.Axy.ptr);
    bssn_gpu_register_device_buffer(Axz, cache.Axz.ptr);
    bssn_gpu_register_device_buffer(Ayy, cache.Ayy.ptr);
    bssn_gpu_register_device_buffer(Ayz, cache.Ayz.ptr);
    bssn_gpu_register_device_buffer(Azz, cache.Azz.ptr);
  }

  return ok ? 0 : 1;
}

int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               const double *X, const double *Y, const double *Z,
                               double xmin, double ymin, double zmin,
                               double xmax, double ymax, double zmax,
                               const double *state0,
                               const double *boundary_src,
                               double *stage_data,
                               double *rhs_accum,
                               double propspeed,
                               const double SoA[3],
                               int symmetry,
                               int lev,
                               int rk_stage)
{
  struct Rk4VarCache
  {
    CachedBuffer X, Y, Z;
    CachedBuffer state0, boundary, stage, rhs;
    const double *host_X = nullptr;
    const double *host_Y = nullptr;
    const double *host_Z = nullptr;
    const double *host_state0 = nullptr;
    double *host_rhs = nullptr;
    int nx = 0;
    int ny = 0;
    int nz = 0;
    bool rhs_resident = false;
  };
  static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map;
  Rk4VarCache &cache = cache_map[state0];

  int nx = ex[0];
  int ny = ex[1];
  int nz = ex[2];
  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  const size_t bytes_x = static_cast<size_t>(nx) * sizeof(double);
  const size_t bytes_y = static_cast<size_t>(ny) * sizeof(double);
  const size_t bytes_z = static_cast<size_t>(nz) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

  const bool need_bam_boundary = (lev == 0);
  const bool need_coord_copy = need_bam_boundary;
  const bool need_boundary_input = need_bam_boundary && (rk_stage != 0);
  const bool need_stage_input = (rk_stage != 0);
  bool ok = true;
  if (need_coord_copy &&
      (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
       cache.nx != nx || cache.ny != ny || cache.nz != nz))
  {
    ok = copy_to_device(cache.X, X, bytes_x) &&
         copy_to_device(cache.Y, Y, bytes_y) &&
         copy_to_device(cache.Z, Z, bytes_z);
    if (ok)
    {
      cache.host_X = X;
      cache.host_Y = Y;
      cache.host_Z = Z;
      cache.nx = nx;
      cache.ny = ny;
      cache.nz = nz;
    }
  }

  const bool refresh_state0 =
      (rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz;
  const bool refresh_rhs =
      (rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
  double *stage_ptr = nullptr;
  const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;

  ok = ok &&
       (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
       (!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
       (!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));

  if (ok && need_stage_input)
  {
    if (mapped_stage_ptr)
    {
      stage_ptr = const_cast<double *>(mapped_stage_ptr);
    }
    else
    {
      ok = copy_to_device_preferring_device(cache.stage, stage_data, bytes);
      stage_ptr = cache.stage.ptr;
    }
  }
  else if (ok)
  {
    ok = ensure_capacity(cache.stage, bytes);
    stage_ptr = cache.stage.ptr;
  }

  if (!ok)
    return 1;

  if (refresh_state0)
  {
    cache.host_state0 = state0;
    bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
  }
  if (refresh_rhs)
  {
    cache.host_rhs = rhs_accum;
    cache.rhs_resident = true;
    bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
  }

  double dX = X[1] - X[0];
  double dY = Y[1] - Y[0];
  double dZ = Z[1] - Z[0];
  const int no_symm = 0, eq_symm = 1, octant = 2;
  int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
  int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
  int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
  int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
  int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
  int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);
  double soa0 = SoA[0];
  double soa1 = SoA[1];
  double soa2 = SoA[2];

  if (need_bam_boundary)
  {
    int imin = 1;
    int jmin = 1;
    int kmin = 1;
    if (symmetry > no_symm && std::fabs(Z[0]) < dZ) kmin = 0;
    if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
    if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;

    double *d_X = cache.X.ptr, *d_Y = cache.Y.ptr, *d_Z = cache.Z.ptr;
    double *d_state0 = cache.state0.ptr, *d_boundary = cache.boundary.ptr;
    double *d_rhs = cache.rhs.ptr;
    double *bam_target = (rk_stage == 0) ? d_rhs : stage_ptr;
    const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
    void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
                    &xmin, &ymin, &zmin, &xmax, &ymax, &zmax,
                    &has_xmin, &has_ymin, &has_zmin,
                    &has_xmax, &has_ymax, &has_zmax,
                    &imin, &jmin, &kmin, &propspeed,
                    &bam_source, &bam_target,
                    &soa0, &soa1, &soa2,
                    &symmetry};
    ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
  }

  if (ok)
  {
    double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
    void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
    ok = launch_kernel(grid, block, (const void *)rk4_kernel, args);
  }

  if (ok && lev > 0)
  {
      double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
      void *args[] = {&nx, &ny, &nz,
                      &has_xmin, &has_ymin, &has_zmin,
                      &has_xmax, &has_ymax, &has_zmax,
                      &d_state0, &d_stage};
    ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
  }

  if (ok)
  {
    bssn_gpu_register_device_buffer(stage_data, stage_ptr);

    cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
    ok = err == cudaSuccess;
  }

  return ok ? 0 : 1;
}

int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
{
  static thread_local CachedBuffer d_chi;

  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

  double *device_chi = nullptr;
  const double *mapped = bssn_gpu_find_device_buffer(chi);
  bool ok = true;
  if (mapped)
  {
    device_chi = const_cast<double *>(mapped);
  }
  else
  {
    ok = copy_to_device_preferring_device(d_chi, chi, bytes);
    device_chi = d_chi.ptr;
  }

  if (ok)
  {
    double *ptr = device_chi;
    void *args[] = {&n, &ptr, &tinny};
    ok = launch_kernel(grid, block, (const void *)lowerbound_kernel, args);
  }

  if (ok)
  {
    bssn_gpu_register_device_buffer(chi, device_chi);
    cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
    ok = err == cudaSuccess;
  }
  return ok ? 0 : 1;
}

int bssn_cuda_prolong3_pack(int wei,
                            const double *llbc, const double *uubc, const int *extc, const double *func,
                            const double *llbf, const double *uubf, const int *extf, double *funf,
                            const double *llbp, const double *uubp,
                            const double *SoA, int symmetry)
{
  if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
    return 1;

  // The current input runs with equatorial symmetry enabled.
  // The symmetry-aware prolong CUDA path is not numerically stable yet,
  // so force a safe fallback to the original Fortran implementation.
  if (symmetry != 0)
    return 1;

  auto idint_like = [](double x) -> int {
    return static_cast<int>(std::trunc(x));
  };

  double base[3];
  double CD[3];
  double FD[3];
  int lbc[3], ubc[3], lbf[3], ubf[3], lbp[3], ubp[3], lbpc[3], ubpc[3];
  for (int d = 0; d < 3; ++d)
  {
    CD[d] = (uubc[d] - llbc[d]) / static_cast<double>(extc[d]);
    FD[d] = (uubf[d] - llbf[d]) / static_cast<double>(extf[d]);
    if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10)
      return 1;

    if (llbc[d] <= llbf[d])
    {
      base[d] = llbc[d];
    }
    else
    {
      int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4);
      base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0);
    }

    lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1;
    ubf[d] = idint_like((uubf[d] - base[d]) / FD[d] + 0.4);
    lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1;
    ubc[d] = idint_like((uubc[d] - base[d]) / CD[d] + 0.4);
    lbp[d] = idint_like((llbp[d] - base[d]) / FD[d] + 0.4) + 1;
    lbpc[d] = idint_like((llbp[d] - base[d]) / CD[d] + 0.4) + 1;
    ubp[d] = idint_like((uubp[d] - base[d]) / FD[d] + 0.4);
    ubpc[d] = idint_like((uubp[d] - base[d]) / CD[d] + 0.4);
    (void)ubc[d];
    (void)ubf[d];
  }

  const int imino = lbp[0] - lbf[0] + 1;
  const int imaxo = ubp[0] - lbf[0] + 1;
  const int jmino = lbp[1] - lbf[1] + 1;
  const int jmaxo = ubp[1] - lbf[1] + 1;
  const int kmino = lbp[2] - lbf[2] + 1;
  const int kmaxo = ubp[2] - lbf[2] + 1;

  const int imini = lbpc[0] - lbc[0] + 1;
  const int imaxi = ubpc[0] - lbc[0] + 1;
  const int jmini = lbpc[1] - lbc[1] + 1;
  const int jmaxi = ubpc[1] - lbc[1] + 1;
  const int kmini = lbpc[2] - lbc[2] + 1;
  const int kmaxi = ubpc[2] - lbc[2] + 1;

  if (imino < 1 || jmino < 1 || kmino < 1 ||
      imini < 1 || jmini < 1 || kmini < 1 ||
      imaxo > extf[0] || jmaxo > extf[1] || kmaxo > extf[2] ||
      imaxi > extc[0] - 2 || jmaxi > extc[1] - 2 || kmaxi > extc[2] - 2)
  {
    return 1;
  }

  auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int {
    int ii = fine_idx + lbf_d - 1;
    return ii / 2 - lbc_d + 1;
  };
  const int ic_min = coarse_center_index(imino, lbf[0], lbc[0]);
  const int jc_min = coarse_center_index(jmino, lbf[1], lbc[1]);
  const int kc_min = coarse_center_index(kmino, lbf[2], lbc[2]);

  // Current CUDA prolong path only supports the same fast path as the
  // optimized Fortran code: interior stencil access without symmetry_bd().
  // If the stencil touches the symmetry boundary, fall back to Fortran.
  if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1)
    return 1;

  struct ProlongCache
  {
    CachedBuffer coarse;
    CachedBuffer fine;
  };
  static thread_local ProlongCache cache;

  int nxc = extc[0], nyc = extc[1], nzc = extc[2];
  int nxf = extf[0], nyf = extf[1], nzf = extf[2];
  int sxc = nxc + 3;
  int syc = nyc + 3;
  int szc = nzc + 3;
  int coarse_points = sxc * syc * szc;
  int fine_points = nxf * nyf * nzf;
  const size_t coarse_bytes = static_cast<size_t>(coarse_points) * sizeof(double);
  const size_t fine_bytes = static_cast<size_t>(fine_points) * sizeof(double);

  std::vector<double> funcc_host(static_cast<size_t>(coarse_points), 0.0);
  auto coarse_index = [sxc, syc](int fi, int fj, int fk) -> size_t {
    return static_cast<size_t>(fi + 2) +
           static_cast<size_t>(fj + 2) * static_cast<size_t>(sxc) +
           static_cast<size_t>(fk + 2) * static_cast<size_t>(sxc) * static_cast<size_t>(syc);
  };
  auto func_index = [nxc, nyc](int i, int j, int k) -> size_t {
    return static_cast<size_t>(i - 1) +
           static_cast<size_t>(j - 1) * static_cast<size_t>(nxc) +
           static_cast<size_t>(k - 1) * static_cast<size_t>(nxc) * static_cast<size_t>(nyc);
  };

  for (int k = 1; k <= nzc; ++k)
  {
    for (int j = 1; j <= nyc; ++j)
    {
      for (int i = 1; i <= nxc; ++i)
      {
        funcc_host[coarse_index(i, j, k)] = func[func_index(i, j, k)];
      }
    }
  }

  for (int offset = 0; offset < 3; ++offset)
  {
    int target_i = -offset;
    int source_i = offset + 2;
    for (int k = 1; k <= nzc; ++k)
    {
      for (int j = 1; j <= nyc; ++j)
      {
        funcc_host[coarse_index(target_i, j, k)] = funcc_host[coarse_index(source_i, j, k)] * SoA[0];
      }
    }
  }

  for (int offset = 0; offset < 3; ++offset)
  {
    int target_j = -offset;
    int source_j = offset + 2;
    for (int k = 1; k <= nzc; ++k)
    {
      for (int i = -2; i <= nxc; ++i)
      {
        funcc_host[coarse_index(i, target_j, k)] = funcc_host[coarse_index(i, source_j, k)] * SoA[1];
      }
    }
  }

  for (int offset = 0; offset < 3; ++offset)
  {
    int target_k = -offset;
    int source_k = offset + 2;
    for (int j = -2; j <= nyc; ++j)
    {
      for (int i = -2; i <= nxc; ++i)
      {
        funcc_host[coarse_index(i, j, target_k)] = funcc_host[coarse_index(i, j, source_k)] * SoA[2];
      }
    }
  }

  double *d_func = nullptr;
  if (!copy_to_device(cache.coarse, funcc_host.data(), coarse_bytes))
    return 1;
  d_func = cache.coarse.ptr;

  if (!ensure_capacity(cache.fine, fine_bytes))
  {
    return 1;
  }

  dim3 block(256);
  dim3 grid(div_up(fine_points, static_cast<int>(block.x)));
  double *d_funf = cache.fine.ptr;
  int ibegin = imino - 1, iend = imaxo - 1;
  int jbegin = jmino - 1, jend = jmaxo - 1;
  int kbegin = kmino - 1, kend = kmaxo - 1;
  void *args[] = {&d_func, &d_funf,
                  &sxc, &syc,
                  &nxf, &nyf, &nzf,
                  &lbc[0], &lbc[1], &lbc[2],
                  &lbf[0], &lbf[1], &lbf[2],
                  &ibegin, &iend,
                  &jbegin, &jend,
                  &kbegin, &kend};

  if (!launch_kernel(grid, block, (const void *)prolong3_cell_kernel, args))
    return 1;

  cudaError_t sync_err = cudaDeviceSynchronize();
  if (sync_err != cudaSuccess)
  {
    report_cuda_error("cudaDeviceSynchronize prolong3", sync_err);
    return 1;
  }

  cudaError_t err = cudaMemcpy(funf, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) prolong3", err);
    return 1;
  }
  return 0;
}