Add runnable GPU main-path prototype

2026-04-08 19:14:37 +08:00
parent 8c1f4d8108
commit ea470737db
10 changed files with 1068 additions and 39 deletions
--- a/AMSS_NCKU_source/ABE.C
+++ b/AMSS_NCKU_source/ABE.C
@@ -29,16 +29,11 @@ using namespace std;
 #error "not define ABEtype"
 #endif
-#if (ABEtype == 0)
+#if (ABEtype == 0)
-
+#include "bssn_class.h"
-#ifdef USE_GPU
+
-#include "bssn_gpu_class.h"
+#elif (ABEtype == 1)
-#else
+#include "bssnEScalar_class.h"
 #include "bssn_class.h"
 #endif
 #elif (ABEtype == 1)
 #include "bssnEScalar_class.h"
 #elif (ABEtype == 2)
 #include "Z4c_class.h"
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -3026,9 +3026,14 @@ void bssn_class::RecursiveStep(int lev, int num) // in all 2^(lev+1)-1 steps
 #if (PSTR == 0)
 #if 1
-void bssn_class::Step(int lev, int YN)
+void bssn_class::Step(int lev, int YN)
-{
+{
-  setpbh(BH_num, Porg0, Mass, BH_num_input);
+#ifdef USE_GPU
  Step_MainPath_GPU(lev, YN);
  return;
 #endif
  setpbh(BH_num, Porg0, Mass, BH_num_input);
  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -171,16 +171,19 @@ public:
       bool check_Stdin_Abort(); 
-       virtual void Setup_Initial_Data_Cao();
+       virtual void Setup_Initial_Data_Cao();
-       virtual void Setup_Initial_Data_Lousto();
+       virtual void Setup_Initial_Data_Lousto();
-       virtual void Initialize();
+       virtual void Initialize();
-       virtual void Read_Ansorg();
+       virtual void Read_Ansorg();
-       virtual void Read_Pablo() {};
+       virtual void Read_Pablo() {};
-       virtual void Compute_Psi4(int lev);
+       virtual void Compute_Psi4(int lev);
-       virtual void Step(int lev, int YN);
+       virtual void Step(int lev, int YN);
-       virtual void Interp_Constraint(bool infg);
+#ifdef USE_GPU
-       virtual void Constraint_Out();
+       void Step_MainPath_GPU(int lev, int YN);
-       virtual void Compute_Constraint();
+#endif
       virtual void Interp_Constraint(bool infg);
       virtual void Constraint_Out();
       virtual void Compute_Constraint();
 #ifdef With_AHF
 protected:
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -0,0 +1,572 @@
 #include "bssn_cuda_ops.h"
 #include <cmath>
 #include <cstdio>
 #include <cuda_runtime.h>
 namespace {
 inline void report_cuda_error(const char *where, cudaError_t err)
 {
  if (err != cudaSuccess)
    std::fprintf(stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString(err));
 }
 inline int count_points(const int ex[3])
 {
  return ex[0] * ex[1] * ex[2];
 }
 inline int div_up(int a, int b)
 {
  return (a + b - 1) / b;
 }
 struct DeviceArrays
 {
  double *x = nullptr;
  double *y = nullptr;
  double *z = nullptr;
  double *a = nullptr;
  double *b = nullptr;
  double *c = nullptr;
  double *d = nullptr;
 };
 inline bool copy_to_device(double *&dst, const double *src, size_t bytes)
 {
  cudaError_t err = cudaMalloc(&dst, bytes);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMalloc", err);
    return false;
  }
  err = cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D)", err);
    return false;
  }
  return true;
 }
 inline void free_device(double *ptr)
 {
  if (ptr)
    cudaFree(ptr);
 }
 __global__ void enforce_ga_kernel(int n,
                                  double *dxx, double *gxy, double *gxz,
                                  double *dyy, double *gyz, double *dzz,
                                  double *Axx, double *Axy, double *Axz,
                                  double *Ayy, double *Ayz, double *Azz)
 {
  const double one = 1.0;
  const double two = 2.0;
  const double one_third = 1.0 / 3.0;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    double lgxx = dxx[idx] + one;
    double lgyy = dyy[idx] + one;
    double lgzz = dzz[idx] + one;
    double lgxy = gxy[idx];
    double lgxz = gxz[idx];
    double lgyz = gyz[idx];
    double det = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz
               - lgxz * lgyy * lgxz - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz;
    double scale = 1.0 / cbrt(det);
    lgxx *= scale;
    lgxy *= scale;
    lgxz *= scale;
    lgyy *= scale;
    lgyz *= scale;
    lgzz *= scale;
    dxx[idx] = lgxx - one;
    gxy[idx] = lgxy;
    gxz[idx] = lgxz;
    dyy[idx] = lgyy - one;
    gyz[idx] = lgyz;
    dzz[idx] = lgzz - one;
    double gupxx = (lgyy * lgzz - lgyz * lgyz);
    double gupxy = -(lgxy * lgzz - lgyz * lgxz);
    double gupxz = (lgxy * lgyz - lgyy * lgxz);
    double gupyy = (lgxx * lgzz - lgxz * lgxz);
    double gupyz = -(lgxx * lgyz - lgxy * lgxz);
    double gupzz = (lgxx * lgyy - lgxy * lgxy);
    double trA = gupxx * Axx[idx] + gupyy * Ayy[idx] + gupzz * Azz[idx]
               + two * (gupxy * Axy[idx] + gupxz * Axz[idx] + gupyz * Ayz[idx]);
    Axx[idx] -= one_third * lgxx * trA;
    Axy[idx] -= one_third * lgxy * trA;
    Axz[idx] -= one_third * lgxz * trA;
    Ayy[idx] -= one_third * lgyy * trA;
    Ayz[idx] -= one_third * lgyz * trA;
    Azz[idx] -= one_third * lgzz * trA;
  }
 }
 __device__ inline int index3(int i, int j, int k, int nx, int ny)
 {
  return i + j * nx + k * nx * ny;
 }
 __device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k,
                                            int nx, int ny, int nz,
                                            const double soa[3], int symmetry)
 {
  double sign = 1.0;
  if (i < 0)
  {
    i = -i - 1;
    sign *= soa[0];
  }
  if (j < 0)
  {
    j = -j - 1;
    sign *= soa[1];
  }
  if (k < 0)
  {
    k = -k - 1;
    sign *= soa[2];
  }
  if (i >= nx) i = nx - 1;
  if (j >= ny) j = ny - 1;
  if (k >= nz) k = nz - 1;
  (void)symmetry;
  return sign * f[index3(i, j, k, nx, ny)];
 }
 __global__ void rk4_kernel(int n, double dT,
                           const double *f0,
                           double *f1,
                           double *rhs,
                           int stage)
 {
  const double half = 0.5;
  const double one_sixth = 1.0 / 6.0;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    if (stage == 0)
    {
      f1[idx] = f0[idx] + half * dT * rhs[idx];
    }
    else if (stage == 1)
    {
      rhs[idx] += 2.0 * f1[idx];
      f1[idx] = f0[idx] + half * dT * f1[idx];
    }
    else if (stage == 2)
    {
      rhs[idx] += 2.0 * f1[idx];
      f1[idx] = f0[idx] + dT * f1[idx];
    }
    else
    {
      f1[idx] = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
    }
  }
 }
 __global__ void lowerbound_kernel(int n, double *chi, double tinny)
 {
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    if (chi[idx] < tinny)
      chi[idx] = tinny;
  }
 }
 __global__ void copy_physical_boundary_kernel(int nx, int ny, int nz,
                                              int has_xmin, int has_ymin, int has_zmin,
                                              int has_xmax, int has_ymax, int has_zmax,
                                              const double *src, double *dst)
 {
  const int n = nx * ny * nz;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nx * ny;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nx;
    const int i = rem - j * nx;
    if ((has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
        (has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
        (has_zmin && k == 0) || (has_zmax && k == nz - 1))
    {
      dst[idx] = src[idx];
    }
  }
 }
 __global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
                                      const double *X, const double *Y, const double *Z,
                                      double xmin, double ymin, double zmin,
                                      double xmax, double ymax, double zmax,
                                      int has_xmin, int has_ymin, int has_zmin,
                                      int has_xmax, int has_ymax, int has_zmax,
                                      int imin, int jmin, int kmin,
                                      double propspeed,
                                      const double *f0,
                                      double *target,
                                      double soa0, double soa1, double soa2,
                                      int symmetry)
 {
  const double one = 1.0;
  const double two = 2.0;
  const int n = nx * ny * nz;
  const double dX = X[1] - X[0];
  const double dY = Y[1] - Y[0];
  const double dZ = Z[1] - Z[0];
  const double d2dx = one / (two * dX);
  const double d2dy = one / (two * dY);
  const double d2dz = one / (two * dZ);
  const double soa[3] = {soa0, soa1, soa2};
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nx * ny;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nx;
    const int i = rem - j * nx;
    const bool on_boundary =
        (has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
        (has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
        (has_zmin && k == 0) || (has_zmax && k == nz - 1);
    if (!on_boundary)
      continue;
    const double radius = sqrt(X[i] * X[i] + Y[j] * Y[j] + Z[k] * Z[k]);
    if (radius == 0.0)
      continue;
    const double wx = propspeed * X[i] / radius;
    const double wy = propspeed * Y[j] / radius;
    const double wz = propspeed * Z[k] / radius;
    double fx = 0.0;
    double fy = 0.0;
    double fz = 0.0;
    if (wx > 0.0)
    {
      if (i - 2 >= imin)
        fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
      else if (i - 1 >= imin)
        fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
      else
        fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
    }
    else if (wx < 0.0)
    {
      if (i + 2 <= nx - 1)
        fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
      else if (i + 1 <= nx - 1)
        fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
      else
        fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
    }
    if (wy > 0.0)
    {
      if (j - 2 >= jmin)
        fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
      else if (j - 1 >= jmin)
        fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
      else
        fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
    }
    else if (wy < 0.0)
    {
      if (j + 2 <= ny - 1)
        fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
      else if (j + 1 <= ny - 1)
        fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
      else
        fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
    }
    if (wz > 0.0)
    {
      if (k - 2 >= kmin)
        fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
      else if (k - 1 >= kmin)
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
      else
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
    }
    else if (wz < 0.0)
    {
      if (k + 2 <= nz - 1)
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
      else if (k + 1 <= nz - 1)
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
      else
        fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
    }
    target[idx] = -propspeed * (fx * X[i] + fy * Y[j] + fz * Z[k] + f0[idx]) / radius;
  }
 }
 inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **args)
 {
  cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaLaunchKernel", err);
    return false;
  }
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaDeviceSynchronize", err);
    return false;
  }
  return true;
 }
 } // namespace
 int bssn_cuda_enforce_ga(int *ex,
                         double *dxx, double *gxy, double *gxz,
                         double *dyy, double *gyz, double *dzz,
                         double *Axx, double *Axy, double *Axz,
                         double *Ayy, double *Ayz, double *Azz)
 {
  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));
  double *d_dxx = nullptr, *d_gxy = nullptr, *d_gxz = nullptr;
  double *d_dyy = nullptr, *d_gyz = nullptr, *d_dzz = nullptr;
  double *d_Axx = nullptr, *d_Axy = nullptr, *d_Axz = nullptr;
  double *d_Ayy = nullptr, *d_Ayz = nullptr, *d_Azz = nullptr;
  bool ok = copy_to_device(d_dxx, dxx, bytes) &&
            copy_to_device(d_gxy, gxy, bytes) &&
            copy_to_device(d_gxz, gxz, bytes) &&
            copy_to_device(d_dyy, dyy, bytes) &&
            copy_to_device(d_gyz, gyz, bytes) &&
            copy_to_device(d_dzz, dzz, bytes) &&
            copy_to_device(d_Axx, Axx, bytes) &&
            copy_to_device(d_Axy, Axy, bytes) &&
            copy_to_device(d_Axz, Axz, bytes) &&
            copy_to_device(d_Ayy, Ayy, bytes) &&
            copy_to_device(d_Ayz, Ayz, bytes) &&
            copy_to_device(d_Azz, Azz, bytes);
  if (ok)
  {
    void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
                    &d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
    ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args);
  }
  if (ok)
  {
    cudaError_t err = cudaMemcpy(dxx, d_dxx, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dxx", err);
    ok = err == cudaSuccess;
    if (ok) { err = cudaMemcpy(gxy, d_gxy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; }
    if (ok) { err = cudaMemcpy(gxz, d_gxz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; }
    if (ok) { err = cudaMemcpy(dyy, d_dyy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; }
    if (ok) { err = cudaMemcpy(gyz, d_gyz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; }
    if (ok) { err = cudaMemcpy(dzz, d_dzz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; }
    if (ok) { err = cudaMemcpy(Axx, d_Axx, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; }
    if (ok) { err = cudaMemcpy(Axy, d_Axy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; }
    if (ok) { err = cudaMemcpy(Axz, d_Axz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; }
    if (ok) { err = cudaMemcpy(Ayy, d_Ayy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; }
    if (ok) { err = cudaMemcpy(Ayz, d_Ayz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; }
    if (ok) { err = cudaMemcpy(Azz, d_Azz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; }
  }
  free_device(d_dxx); free_device(d_gxy); free_device(d_gxz);
  free_device(d_dyy); free_device(d_gyz); free_device(d_dzz);
  free_device(d_Axx); free_device(d_Axy); free_device(d_Axz);
  free_device(d_Ayy); free_device(d_Ayz); free_device(d_Azz);
  return ok ? 0 : 1;
 }
 int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               const double *X, const double *Y, const double *Z,
                               double xmin, double ymin, double zmin,
                               double xmax, double ymax, double zmax,
                               const double *state0,
                               const double *boundary_src,
                               double *stage_data,
                               double *rhs_accum,
                               double propspeed,
                               const double SoA[3],
                               int symmetry,
                               int lev,
                               int rk_stage)
 {
  int nx = ex[0];
  int ny = ex[1];
  int nz = ex[2];
  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  const size_t bytes_x = static_cast<size_t>(nx) * sizeof(double);
  const size_t bytes_y = static_cast<size_t>(ny) * sizeof(double);
  const size_t bytes_z = static_cast<size_t>(nz) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));
  double *d_X = nullptr, *d_Y = nullptr, *d_Z = nullptr;
  double *d_state0 = nullptr, *d_boundary = nullptr, *d_stage = nullptr, *d_rhs = nullptr;
  bool ok = copy_to_device(d_X, X, bytes_x) &&
            copy_to_device(d_Y, Y, bytes_y) &&
            copy_to_device(d_Z, Z, bytes_z) &&
            copy_to_device(d_state0, state0, bytes) &&
            copy_to_device(d_boundary, boundary_src, bytes) &&
            copy_to_device(d_stage, stage_data, bytes) &&
            copy_to_device(d_rhs, rhs_accum, bytes);
  if (!ok)
  {
    free_device(d_X); free_device(d_Y); free_device(d_Z);
    free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
    return 1;
  }
  double dX = X[1] - X[0];
  double dY = Y[1] - Y[0];
  double dZ = Z[1] - Z[0];
  const int no_symm = 0, eq_symm = 1, octant = 2;
  int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
  int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
  int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
  int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
  int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
  int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);
  double soa0 = SoA[0];
  double soa1 = SoA[1];
  double soa2 = SoA[2];
  if (lev == 0)
  {
    int imin = 1;
    int jmin = 1;
    int kmin = 1;
    if (symmetry > no_symm && std::fabs(Z[0]) < dZ) kmin = 0;
    if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
    if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;
    double *bam_target = (rk_stage == 0) ? d_rhs : d_stage;
    const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
    void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
                    &xmin, &ymin, &zmin, &xmax, &ymax, &zmax,
                    &has_xmin, &has_ymin, &has_zmin,
                    &has_xmax, &has_ymax, &has_zmax,
                    &imin, &jmin, &kmin, &propspeed,
                    &bam_source, &bam_target,
                    &soa0, &soa1, &soa2,
                    &symmetry};
    ok = launch_and_sync(grid, block, (const void *)sommerfeld_bam_kernel, args);
  }
  if (ok)
  {
    void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
    ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args);
  }
  if (ok && lev > 0)
  {
      void *args[] = {&nx, &ny, &nz,
                      &has_xmin, &has_ymin, &has_zmin,
                      &has_xmax, &has_ymax, &has_zmax,
                      &d_state0, &d_stage};
    ok = launch_and_sync(grid, block, (const void *)copy_physical_boundary_kernel, args);
  }
  if (ok)
  {
    cudaError_t err = cudaMemcpy(stage_data, d_stage, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
    ok = err == cudaSuccess;
    if (ok)
    {
      err = cudaMemcpy(rhs_accum, d_rhs, bytes, cudaMemcpyDeviceToHost);
      if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err);
      ok = err == cudaSuccess;
    }
  }
  free_device(d_X); free_device(d_Y); free_device(d_Z);
  free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
  return ok ? 0 : 1;
 }
 int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
 {
  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));
  double *d_chi = nullptr;
  bool ok = copy_to_device(d_chi, chi, bytes);
  if (ok)
  {
    void *args[] = {&n, &d_chi, &tinny};
    ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args);
  }
  if (ok)
  {
    cudaError_t err = cudaMemcpy(chi, d_chi, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
    ok = err == cudaSuccess;
  }
  free_device(d_chi);
  return ok ? 0 : 1;
 }
--- a/AMSS_NCKU_source/bssn_cuda_ops.h
+++ b/AMSS_NCKU_source/bssn_cuda_ops.h
@@ -0,0 +1,26 @@
 #ifndef BSSN_CUDA_OPS_H
 #define BSSN_CUDA_OPS_H
 int bssn_cuda_enforce_ga(int *ex,
                         double *dxx, double *gxy, double *gxz,
                         double *dyy, double *gyz, double *dzz,
                         double *Axx, double *Axy, double *Axz,
                         double *Ayy, double *Ayz, double *Azz);
 int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               const double *X, const double *Y, const double *Z,
                               double xmin, double ymin, double zmin,
                               double xmax, double ymax, double zmax,
                               const double *state0,
                               const double *boundary_src,
                               double *stage_data,
                               double *rhs_accum,
                               double propspeed,
                               const double SoA[3],
                               int symmetry,
                               int lev,
                               int rk_stage);
 int bssn_cuda_lowerbound(int *ex, double *chi, double tinny);
 #endif
--- a/AMSS_NCKU_source/bssn_cuda_step.C
+++ b/AMSS_NCKU_source/bssn_cuda_step.C
@@ -0,0 +1,364 @@
 #include "macrodef.h"
 #ifdef USE_GPU
 #include <cmath>
 #include <vector>
 #include "bssn_class.h"
 #include "bssn_cuda_ops.h"
 #include "bssn_gpu.h"
 #include "bssn_macro.h"
 #include "rungekutta4_rout.h"
 void bssn_class::Step_MainPath_GPU(int lev, int YN)
 {
 #ifdef WithShell
 #error "Step_MainPath_GPU currently supports Patch grids only."
 #endif
  setpbh(BH_num, Porg0, Mass, BH_num_input);
  const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
 #if (MAPBH == 1)
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
      for (int ith = 0; ith < 3; ith++)
        Porg1[ithBH][ith] = Porg0[ithBH][ith] + Porg_rhs[ithBH][ith] * dT_lev;
      if (Symmetry > 0)
        Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
      if (Symmetry == 2)
      {
        Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
        Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
      }
    }
  }
  if (lev == a_lev)
    AnalysisStuff(lev, dT_lev);
 #endif
 #ifdef With_AHF
  AH_Step_Find(lev, dT_lev);
 #endif
  const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
  (void)BB;
  double ndeps = (lev < GH->movls) ? numepsb : numepss;
  double TRK4 = PhysTime;
  int iter_count = 0;
  int pre = 0, cor = 1;
  int ERROR = 0;
  auto run_stage_on_block =
      [&](Block *cg, Patch *patch, MyList<var> *state0_list,
          MyList<var> *boundary_src_list, MyList<var> *stage_data_list,
          MyList<var> *rhs_list, int rk_stage) {
        MyList<var> *varl0 = state0_list;
        MyList<var> *varlb = boundary_src_list;
        MyList<var> *varls = stage_data_list;
        MyList<var> *varlr = rhs_list;
        while (varl0)
        {
          if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev,
                                         cg->X[0], cg->X[1], cg->X[2],
                                         patch->bbox[0], patch->bbox[1], patch->bbox[2],
                                         patch->bbox[3], patch->bbox[4], patch->bbox[5],
                                         cg->fgfs[varl0->data->sgfn],
                                         cg->fgfs[varlb->data->sgfn],
                                         cg->fgfs[varls->data->sgfn],
                                         cg->fgfs[varlr->data->sgfn],
                                         varl0->data->propspeed,
                                         varl0->data->SoA,
                                         Symmetry, lev, rk_stage))
          {
            cerr << "GPU rk4/boundary failure: lev=" << lev
                 << " rk_stage=" << rk_stage
                 << " var=" << varl0->data->name
                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
            break;
          }
          varl0 = varl0->next;
          varlb = varlb->next;
          varls = varls->next;
          varlr = varlr->next;
        }
      };
  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank)
      {
 #if (AGM == 0)
        if (bssn_cuda_enforce_ga(cg->shape,
                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                                 cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
                                 cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]))
        {
          cerr << "GPU enforce_ga failure: lev=" << lev
               << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
               << cg->bbox[1] << ":" << cg->bbox[4] << ","
               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
          ERROR = 1;
        }
 #endif
        if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
          ERROR = 1;
        run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
        if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny))
        {
          cerr << "GPU lowerbound failure: lev=" << lev
               << " rk_stage=" << iter_count
               << " var=" << phi->name
               << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
               << cg->bbox[1] << ":" << cg->bbox[4] << ","
               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
          ERROR = 1;
        }
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
  MPI_Request err_req_pre;
  {
    int erh = ERROR;
    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
  }
  Parallel::AsyncSyncState async_pre;
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
  MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #if (MAPBH == 0)
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
      if (Symmetry > 0)
        Porg[ithBH][2] = fabs(Porg[ithBH][2]);
      if (Symmetry == 2)
      {
        Porg[ithBH][0] = fabs(Porg[ithBH][0]);
        Porg[ithBH][1] = fabs(Porg[ithBH][1]);
      }
    }
  }
  if (lev == a_lev)
    AnalysisStuff(lev, dT_lev);
 #endif
  for (iter_count = 1; iter_count < 4; iter_count++)
  {
    if (iter_count == 1 || iter_count == 3)
      TRK4 += dT_lev / 2;
    Pp = GH->PatL[lev];
    while (Pp)
    {
      MyList<Block> *BP = Pp->data->blb;
      while (BP)
      {
        Block *cg = BP->data;
        if (myrank == cg->rank)
        {
 #if (AGM == 0)
          if (bssn_cuda_enforce_ga(cg->shape,
                                   cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
                                   cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                                   cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
                                   cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]))
          {
            cerr << "GPU enforce_ga failure: lev=" << lev
                 << " rk_stage=" << iter_count
                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
 #elif (AGM == 1)
          if (iter_count == 3 &&
              bssn_cuda_enforce_ga(cg->shape,
                                   cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
                                   cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                                   cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
                                   cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]))
          {
            cerr << "GPU enforce_ga failure: lev=" << lev
                 << " rk_stage=" << iter_count
                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
 #endif
          if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
            ERROR = 1;
          run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
          if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny))
          {
            cerr << "GPU lowerbound failure: lev=" << lev
                 << " rk_stage=" << iter_count
                 << " var=" << phi1->name
                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
        }
        if (BP == Pp->data->ble)
          break;
        BP = BP->next;
      }
      Pp = Pp->next;
    }
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
    }
    Parallel::AsyncSyncState async_cor;
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #if (MAPBH == 0)
    if (BH_num > 0 && lev == GH->levels - 1)
    {
      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
      for (int ithBH = 0; ithBH < BH_num; ithBH++)
      {
        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
        if (Symmetry > 0)
          Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
        if (Symmetry == 2)
        {
          Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
          Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
        }
      }
    }
 #endif
    if (iter_count < 3)
    {
      Pp = GH->PatL[lev];
      while (Pp)
      {
        MyList<Block> *BP = Pp->data->blb;
        while (BP)
        {
          BP->data->swapList(SynchList_pre, SynchList_cor, myrank);
          if (BP == Pp->data->ble)
            break;
          BP = BP->next;
        }
        Pp = Pp->next;
      }
 #if (MAPBH == 0)
      if (BH_num > 0 && lev == GH->levels - 1)
      {
        for (int ithBH = 0; ithBH < BH_num; ithBH++)
        {
          Porg[ithBH][0] = Porg1[ithBH][0];
          Porg[ithBH][1] = Porg1[ithBH][1];
          Porg[ithBH][2] = Porg1[ithBH][2];
        }
      }
 #endif
    }
  }
 #if (RPS == 0)
  RestrictProlong(lev, YN, BB);
 #endif
  Pp = GH->PatL[lev];
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      cg->swapList(StateList, SynchList_cor, myrank);
      cg->swapList(OldStateList, SynchList_cor, myrank);
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
      Porg0[ithBH][0] = Porg1[ithBH][0];
      Porg0[ithBH][1] = Porg1[ithBH][1];
      Porg0[ithBH][2] = Porg1[ithBH][2];
    }
  }
 }
 #endif
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -18,12 +18,18 @@ using namespace std;
 #include <fstream>
 #endif
-void compare_result_gpu(int ftag1,double * datac,int data_num){
+void compare_result_gpu(int ftag1,double * datac,int data_num){
-	double * data = (double*)malloc(sizeof(double)*data_num);
+#ifdef RESULT_CHECK
-	cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
+	double * data = (double*)malloc(sizeof(double)*data_num);
-	compare_result(ftag1,data,data_num);
+	cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
-	free(data);
+	compare_result(ftag1,data,data_num);
-}
+	free(data);
 #else
 	(void)ftag1;
 	(void)datac;
 	(void)data_num;
 #endif
 }
 __global__ void test_const_address(double * testd){
 	int _t = blockIdx.x*blockDim.x+threadIdx.x;
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -105,13 +105,12 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
 	   NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
-C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
+C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
-           cgh.o surface_integral.o ShellPatch.o\
+           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
-	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
+	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
-	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
+	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
-           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
+           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
-	   NullShellPatch2_Evo.o \
+	   NullShellPatch2_Evo.o bssn_cuda_step.o writefile_f.o
 	   bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
 F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   prolongrestrict_cell.o prolongrestrict_vertex.o\
@@ -143,7 +142,7 @@ initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o
 TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o 
-CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
+CUDAFILES = bssn_gpu.o bssn_cuda_ops.o
 # file dependences
 $(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -9,6 +9,7 @@ filein  = -I/usr/include/ -I${MKLROOT}/include
 ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
 LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
 CUDA_LDLIBS = -L/usr/local/cuda-12.9/targets/x86_64-linux/lib -lcudart
 ## Memory allocator switch
 ##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
@@ -24,6 +25,8 @@ ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
 LDLIBS := $(CUDA_LDLIBS) $(LDLIBS)
 ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
 ##   opt        : (default) maximum performance with PGO profile-guided optimization
 ##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -9,6 +9,7 @@
 import AMSS_NCKU_Input as input_data
 import os
 import subprocess
 import time
@@ -57,6 +58,48 @@ BUILD_JOBS = 64
 ##################################################################
 ##################################################################
 def prepare_gpu_runtime_env():
    """
    Create a user-private CUDA MPS environment for GPU runs.
    On shared machines another user's daemon may already occupy the default
    /tmp/nvidia-mps pipe directory, which makes plain cudaSetDevice/cudaMalloc
    fail with cudaErrorMpsConnectionFailed.  Binding AMSS-NCKU to a private
    pipe directory avoids cross-user interference.
    """
    env = os.environ.copy()
    pipe_dir = env.get("CUDA_MPS_PIPE_DIRECTORY", f"/tmp/amss-ncku-mps-{os.getuid()}")
    log_dir  = env.get("CUDA_MPS_LOG_DIRECTORY",  f"/tmp/amss-ncku-mps-log-{os.getuid()}")
    os.makedirs(pipe_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
    env["CUDA_MPS_LOG_DIRECTORY"] = log_dir
    control_socket = os.path.join(pipe_dir, "control")
    if not os.path.exists(control_socket):
        start = subprocess.run(
            ["nvidia-cuda-mps-control", "-d"],
            env=env,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        if start.returncode != 0:
            print(f" Warning: failed to start private CUDA MPS daemon in {pipe_dir}")
        else:
            print(f" Using private CUDA MPS pipe directory: {pipe_dir}")
    else:
        print(f" Using existing private CUDA MPS pipe directory: {pipe_dir}")
    return env
 ##################################################################
 ##################################################################
@@ -146,16 +189,29 @@ def run_ABE():
    ## Define the command to run; cast other values to strings as needed
    run_env = None
    if (input_data.GPU_Calculation == "no"):
        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        run_env = prepare_gpu_runtime_env()
        if int(input_data.MPI_processes) == 1:
            mpi_command = "./ABEGPU"
        else:
            mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
        mpi_command_outfile = "ABEGPU_out.log"
    ## Execute the MPI command and stream output
-    mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+    mpi_process = subprocess.Popen(
        mpi_command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        env=run_env,
    )
    ## Write ABE run output to file while printing to stdout
    with open(mpi_command_outfile, 'w') as file0: