diff --git a/AMSS_NCKU_source/ABE.C b/AMSS_NCKU_source/ABE.C
index 9a4874e..83d2032 100644
--- a/AMSS_NCKU_source/ABE.C
+++ b/AMSS_NCKU_source/ABE.C
@@ -29,16 +29,11 @@ using namespace std;
 #error "not define ABEtype"
 #endif
 
-#if (ABEtype == 0)
-
-#ifdef USE_GPU
-#include "bssn_gpu_class.h"
-#else
-#include "bssn_class.h"
-#endif
-
-#elif (ABEtype == 1)
-#include "bssnEScalar_class.h"
+#if (ABEtype == 0)
+#include "bssn_class.h"
+
+#elif (ABEtype == 1)
+#include "bssnEScalar_class.h"
 
 #elif (ABEtype == 2)
 #include "Z4c_class.h"
diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index bcd9117..179233e 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -3026,9 +3026,14 @@ void bssn_class::RecursiveStep(int lev, int num) // in all 2^(lev+1)-1 steps
 
 #if (PSTR == 0)
 #if 1
-void bssn_class::Step(int lev, int YN)
-{
-  setpbh(BH_num, Porg0, Mass, BH_num_input);
+void bssn_class::Step(int lev, int YN)
+{
+#ifdef USE_GPU
+  Step_MainPath_GPU(lev, YN);
+  return;
+#endif
+
+  setpbh(BH_num, Porg0, Mass, BH_num_input);
 
   double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
 
diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h
index 5a8eb2a..94fd306 100644
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -171,16 +171,19 @@ public:
        
        bool check_Stdin_Abort(); 
 
-       virtual void Setup_Initial_Data_Cao();
-       virtual void Setup_Initial_Data_Lousto();
-       virtual void Initialize();
-       virtual void Read_Ansorg();
-       virtual void Read_Pablo() {};
-       virtual void Compute_Psi4(int lev);
-       virtual void Step(int lev, int YN);
-       virtual void Interp_Constraint(bool infg);
-       virtual void Constraint_Out();
-       virtual void Compute_Constraint();
+       virtual void Setup_Initial_Data_Cao();
+       virtual void Setup_Initial_Data_Lousto();
+       virtual void Initialize();
+       virtual void Read_Ansorg();
+       virtual void Read_Pablo() {};
+       virtual void Compute_Psi4(int lev);
+       virtual void Step(int lev, int YN);
+#ifdef USE_GPU
+       void Step_MainPath_GPU(int lev, int YN);
+#endif
+       virtual void Interp_Constraint(bool infg);
+       virtual void Constraint_Out();
+       virtual void Compute_Constraint();
 
 #ifdef With_AHF
 protected:
diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu
new file mode 100644
index 0000000..6ef14d0
--- /dev/null
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -0,0 +1,572 @@
+#include "bssn_cuda_ops.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cuda_runtime.h>
+
+namespace {
+
+inline void report_cuda_error(const char *where, cudaError_t err)
+{
+  if (err != cudaSuccess)
+    std::fprintf(stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString(err));
+}
+
+inline int count_points(const int ex[3])
+{
+  return ex[0] * ex[1] * ex[2];
+}
+
+inline int div_up(int a, int b)
+{
+  return (a + b - 1) / b;
+}
+
+struct DeviceArrays
+{
+  double *x = nullptr;
+  double *y = nullptr;
+  double *z = nullptr;
+  double *a = nullptr;
+  double *b = nullptr;
+  double *c = nullptr;
+  double *d = nullptr;
+};
+
+inline bool copy_to_device(double *&dst, const double *src, size_t bytes)
+{
+  cudaError_t err = cudaMalloc(&dst, bytes);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaMalloc", err);
+    return false;
+  }
+  err = cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaMemcpy(H2D)", err);
+    return false;
+  }
+  return true;
+}
+
+inline void free_device(double *ptr)
+{
+  if (ptr)
+    cudaFree(ptr);
+}
+
+__global__ void enforce_ga_kernel(int n,
+                                  double *dxx, double *gxy, double *gxz,
+                                  double *dyy, double *gyz, double *dzz,
+                                  double *Axx, double *Axy, double *Axz,
+                                  double *Ayy, double *Ayz, double *Azz)
+{
+  const double one = 1.0;
+  const double two = 2.0;
+  const double one_third = 1.0 / 3.0;
+
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
+  {
+    double lgxx = dxx[idx] + one;
+    double lgyy = dyy[idx] + one;
+    double lgzz = dzz[idx] + one;
+    double lgxy = gxy[idx];
+    double lgxz = gxz[idx];
+    double lgyz = gyz[idx];
+
+    double det = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz
+               - lgxz * lgyy * lgxz - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz;
+    double scale = 1.0 / cbrt(det);
+
+    lgxx *= scale;
+    lgxy *= scale;
+    lgxz *= scale;
+    lgyy *= scale;
+    lgyz *= scale;
+    lgzz *= scale;
+
+    dxx[idx] = lgxx - one;
+    gxy[idx] = lgxy;
+    gxz[idx] = lgxz;
+    dyy[idx] = lgyy - one;
+    gyz[idx] = lgyz;
+    dzz[idx] = lgzz - one;
+
+    double gupxx = (lgyy * lgzz - lgyz * lgyz);
+    double gupxy = -(lgxy * lgzz - lgyz * lgxz);
+    double gupxz = (lgxy * lgyz - lgyy * lgxz);
+    double gupyy = (lgxx * lgzz - lgxz * lgxz);
+    double gupyz = -(lgxx * lgyz - lgxy * lgxz);
+    double gupzz = (lgxx * lgyy - lgxy * lgxy);
+
+    double trA = gupxx * Axx[idx] + gupyy * Ayy[idx] + gupzz * Azz[idx]
+               + two * (gupxy * Axy[idx] + gupxz * Axz[idx] + gupyz * Ayz[idx]);
+
+    Axx[idx] -= one_third * lgxx * trA;
+    Axy[idx] -= one_third * lgxy * trA;
+    Axz[idx] -= one_third * lgxz * trA;
+    Ayy[idx] -= one_third * lgyy * trA;
+    Ayz[idx] -= one_third * lgyz * trA;
+    Azz[idx] -= one_third * lgzz * trA;
+  }
+}
+
+__device__ inline int index3(int i, int j, int k, int nx, int ny)
+{
+  return i + j * nx + k * nx * ny;
+}
+
+__device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k,
+                                            int nx, int ny, int nz,
+                                            const double soa[3], int symmetry)
+{
+  double sign = 1.0;
+  if (i < 0)
+  {
+    i = -i - 1;
+    sign *= soa[0];
+  }
+  if (j < 0)
+  {
+    j = -j - 1;
+    sign *= soa[1];
+  }
+  if (k < 0)
+  {
+    k = -k - 1;
+    sign *= soa[2];
+  }
+  if (i >= nx) i = nx - 1;
+  if (j >= ny) j = ny - 1;
+  if (k >= nz) k = nz - 1;
+  (void)symmetry;
+  return sign * f[index3(i, j, k, nx, ny)];
+}
+
+__global__ void rk4_kernel(int n, double dT,
+                           const double *f0,
+                           double *f1,
+                           double *rhs,
+                           int stage)
+{
+  const double half = 0.5;
+  const double one_sixth = 1.0 / 6.0;
+
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
+  {
+    if (stage == 0)
+    {
+      f1[idx] = f0[idx] + half * dT * rhs[idx];
+    }
+    else if (stage == 1)
+    {
+      rhs[idx] += 2.0 * f1[idx];
+      f1[idx] = f0[idx] + half * dT * f1[idx];
+    }
+    else if (stage == 2)
+    {
+      rhs[idx] += 2.0 * f1[idx];
+      f1[idx] = f0[idx] + dT * f1[idx];
+    }
+    else
+    {
+      f1[idx] = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
+    }
+  }
+}
+
+__global__ void lowerbound_kernel(int n, double *chi, double tinny)
+{
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
+  {
+    if (chi[idx] < tinny)
+      chi[idx] = tinny;
+  }
+}
+
+__global__ void copy_physical_boundary_kernel(int nx, int ny, int nz,
+                                              int has_xmin, int has_ymin, int has_zmin,
+                                              int has_xmax, int has_ymax, int has_zmax,
+                                              const double *src, double *dst)
+{
+  const int n = nx * ny * nz;
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
+  {
+    const int plane = nx * ny;
+    const int k = idx / plane;
+    const int rem = idx - k * plane;
+    const int j = rem / nx;
+    const int i = rem - j * nx;
+
+    if ((has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
+        (has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
+        (has_zmin && k == 0) || (has_zmax && k == nz - 1))
+    {
+      dst[idx] = src[idx];
+    }
+  }
+}
+
+__global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
+                                      const double *X, const double *Y, const double *Z,
+                                      double xmin, double ymin, double zmin,
+                                      double xmax, double ymax, double zmax,
+                                      int has_xmin, int has_ymin, int has_zmin,
+                                      int has_xmax, int has_ymax, int has_zmax,
+                                      int imin, int jmin, int kmin,
+                                      double propspeed,
+                                      const double *f0,
+                                      double *target,
+                                      double soa0, double soa1, double soa2,
+                                      int symmetry)
+{
+  const double one = 1.0;
+  const double two = 2.0;
+  const int n = nx * ny * nz;
+  const double dX = X[1] - X[0];
+  const double dY = Y[1] - Y[0];
+  const double dZ = Z[1] - Z[0];
+  const double d2dx = one / (two * dX);
+  const double d2dy = one / (two * dY);
+  const double d2dz = one / (two * dZ);
+  const double soa[3] = {soa0, soa1, soa2};
+
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
+  {
+    const int plane = nx * ny;
+    const int k = idx / plane;
+    const int rem = idx - k * plane;
+    const int j = rem / nx;
+    const int i = rem - j * nx;
+
+    const bool on_boundary =
+        (has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
+        (has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
+        (has_zmin && k == 0) || (has_zmax && k == nz - 1);
+
+    if (!on_boundary)
+      continue;
+
+    const double radius = sqrt(X[i] * X[i] + Y[j] * Y[j] + Z[k] * Z[k]);
+    if (radius == 0.0)
+      continue;
+
+    const double wx = propspeed * X[i] / radius;
+    const double wy = propspeed * Y[j] / radius;
+    const double wz = propspeed * Z[k] / radius;
+
+    double fx = 0.0;
+    double fy = 0.0;
+    double fz = 0.0;
+
+    if (wx > 0.0)
+    {
+      if (i - 2 >= imin)
+        fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
+                   - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
+                   + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
+      else if (i - 1 >= imin)
+        fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
+                   + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
+      else
+        fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
+                   + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
+                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
+    }
+    else if (wx < 0.0)
+    {
+      if (i + 2 <= nx - 1)
+        fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
+                   + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
+                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
+      else if (i + 1 <= nx - 1)
+        fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
+                   + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
+      else
+        fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
+                   - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
+                   + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
+    }
+
+    if (wy > 0.0)
+    {
+      if (j - 2 >= jmin)
+        fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
+                   - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
+                   + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
+      else if (j - 1 >= jmin)
+        fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
+                   + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
+      else
+        fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
+                   + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
+                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
+    }
+    else if (wy < 0.0)
+    {
+      if (j + 2 <= ny - 1)
+        fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
+                   + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
+                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
+      else if (j + 1 <= ny - 1)
+        fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
+                   + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
+      else
+        fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
+                   - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
+                   + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
+    }
+
+    if (wz > 0.0)
+    {
+      if (k - 2 >= kmin)
+        fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
+                   - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
+                   + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
+      else if (k - 1 >= kmin)
+        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
+                   + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
+      else
+        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
+                   + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
+                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
+    }
+    else if (wz < 0.0)
+    {
+      if (k + 2 <= nz - 1)
+        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
+                   + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
+                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
+      else if (k + 1 <= nz - 1)
+        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
+                   + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
+      else
+        fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
+                   - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
+                   + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
+    }
+
+    target[idx] = -propspeed * (fx * X[i] + fy * Y[j] + fz * Z[k] + f0[idx]) / radius;
+  }
+}
+
+inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **args)
+{
+  cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaLaunchKernel", err);
+    return false;
+  }
+  err = cudaDeviceSynchronize();
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaDeviceSynchronize", err);
+    return false;
+  }
+  return true;
+}
+
+} // namespace
+
+int bssn_cuda_enforce_ga(int *ex,
+                         double *dxx, double *gxy, double *gxz,
+                         double *dyy, double *gyz, double *dzz,
+                         double *Axx, double *Axy, double *Axz,
+                         double *Ayy, double *Ayz, double *Azz)
+{
+  int n = count_points(ex);
+  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
+  dim3 block(256);
+  dim3 grid(div_up(n, static_cast<int>(block.x)));
+
+  double *d_dxx = nullptr, *d_gxy = nullptr, *d_gxz = nullptr;
+  double *d_dyy = nullptr, *d_gyz = nullptr, *d_dzz = nullptr;
+  double *d_Axx = nullptr, *d_Axy = nullptr, *d_Axz = nullptr;
+  double *d_Ayy = nullptr, *d_Ayz = nullptr, *d_Azz = nullptr;
+
+  bool ok = copy_to_device(d_dxx, dxx, bytes) &&
+            copy_to_device(d_gxy, gxy, bytes) &&
+            copy_to_device(d_gxz, gxz, bytes) &&
+            copy_to_device(d_dyy, dyy, bytes) &&
+            copy_to_device(d_gyz, gyz, bytes) &&
+            copy_to_device(d_dzz, dzz, bytes) &&
+            copy_to_device(d_Axx, Axx, bytes) &&
+            copy_to_device(d_Axy, Axy, bytes) &&
+            copy_to_device(d_Axz, Axz, bytes) &&
+            copy_to_device(d_Ayy, Ayy, bytes) &&
+            copy_to_device(d_Ayz, Ayz, bytes) &&
+            copy_to_device(d_Azz, Azz, bytes);
+
+  if (ok)
+  {
+    void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
+                    &d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
+    ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args);
+  }
+
+  if (ok)
+  {
+    cudaError_t err = cudaMemcpy(dxx, d_dxx, bytes, cudaMemcpyDeviceToHost);
+    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dxx", err);
+    ok = err == cudaSuccess;
+    if (ok) { err = cudaMemcpy(gxy, d_gxy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(gxz, d_gxz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(dyy, d_dyy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(gyz, d_gyz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(dzz, d_dzz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(Axx, d_Axx, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(Axy, d_Axy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(Axz, d_Axz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(Ayy, d_Ayy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(Ayz, d_Ayz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(Azz, d_Azz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; }
+  }
+
+  free_device(d_dxx); free_device(d_gxy); free_device(d_gxz);
+  free_device(d_dyy); free_device(d_gyz); free_device(d_dzz);
+  free_device(d_Axx); free_device(d_Axy); free_device(d_Axz);
+  free_device(d_Ayy); free_device(d_Ayz); free_device(d_Azz);
+
+  return ok ? 0 : 1;
+}
+
+int bssn_cuda_rk4_boundary_var(int *ex, double dT,
+                               const double *X, const double *Y, const double *Z,
+                               double xmin, double ymin, double zmin,
+                               double xmax, double ymax, double zmax,
+                               const double *state0,
+                               const double *boundary_src,
+                               double *stage_data,
+                               double *rhs_accum,
+                               double propspeed,
+                               const double SoA[3],
+                               int symmetry,
+                               int lev,
+                               int rk_stage)
+{
+  int nx = ex[0];
+  int ny = ex[1];
+  int nz = ex[2];
+  int n = count_points(ex);
+  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
+  const size_t bytes_x = static_cast<size_t>(nx) * sizeof(double);
+  const size_t bytes_y = static_cast<size_t>(ny) * sizeof(double);
+  const size_t bytes_z = static_cast<size_t>(nz) * sizeof(double);
+  dim3 block(256);
+  dim3 grid(div_up(n, static_cast<int>(block.x)));
+
+  double *d_X = nullptr, *d_Y = nullptr, *d_Z = nullptr;
+  double *d_state0 = nullptr, *d_boundary = nullptr, *d_stage = nullptr, *d_rhs = nullptr;
+
+  bool ok = copy_to_device(d_X, X, bytes_x) &&
+            copy_to_device(d_Y, Y, bytes_y) &&
+            copy_to_device(d_Z, Z, bytes_z) &&
+            copy_to_device(d_state0, state0, bytes) &&
+            copy_to_device(d_boundary, boundary_src, bytes) &&
+            copy_to_device(d_stage, stage_data, bytes) &&
+            copy_to_device(d_rhs, rhs_accum, bytes);
+
+  if (!ok)
+  {
+    free_device(d_X); free_device(d_Y); free_device(d_Z);
+    free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
+    return 1;
+  }
+
+  double dX = X[1] - X[0];
+  double dY = Y[1] - Y[0];
+  double dZ = Z[1] - Z[0];
+  const int no_symm = 0, eq_symm = 1, octant = 2;
+  int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
+  int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
+  int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
+  int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
+  int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
+  int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);
+  double soa0 = SoA[0];
+  double soa1 = SoA[1];
+  double soa2 = SoA[2];
+
+  if (lev == 0)
+  {
+    int imin = 1;
+    int jmin = 1;
+    int kmin = 1;
+    if (symmetry > no_symm && std::fabs(Z[0]) < dZ) kmin = 0;
+    if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
+    if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;
+
+    double *bam_target = (rk_stage == 0) ? d_rhs : d_stage;
+    const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
+    void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
+                    &xmin, &ymin, &zmin, &xmax, &ymax, &zmax,
+                    &has_xmin, &has_ymin, &has_zmin,
+                    &has_xmax, &has_ymax, &has_zmax,
+                    &imin, &jmin, &kmin, &propspeed,
+                    &bam_source, &bam_target,
+                    &soa0, &soa1, &soa2,
+                    &symmetry};
+    ok = launch_and_sync(grid, block, (const void *)sommerfeld_bam_kernel, args);
+  }
+
+  if (ok)
+  {
+    void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
+    ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args);
+  }
+
+  if (ok && lev > 0)
+  {
+      void *args[] = {&nx, &ny, &nz,
+                      &has_xmin, &has_ymin, &has_zmin,
+                      &has_xmax, &has_ymax, &has_zmax,
+                      &d_state0, &d_stage};
+    ok = launch_and_sync(grid, block, (const void *)copy_physical_boundary_kernel, args);
+  }
+
+  if (ok)
+  {
+    cudaError_t err = cudaMemcpy(stage_data, d_stage, bytes, cudaMemcpyDeviceToHost);
+    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
+    ok = err == cudaSuccess;
+    if (ok)
+    {
+      err = cudaMemcpy(rhs_accum, d_rhs, bytes, cudaMemcpyDeviceToHost);
+      if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err);
+      ok = err == cudaSuccess;
+    }
+  }
+
+  free_device(d_X); free_device(d_Y); free_device(d_Z);
+  free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
+  return ok ? 0 : 1;
+}
+
+int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
+{
+  int n = count_points(ex);
+  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
+  dim3 block(256);
+  dim3 grid(div_up(n, static_cast<int>(block.x)));
+
+  double *d_chi = nullptr;
+  bool ok = copy_to_device(d_chi, chi, bytes);
+
+  if (ok)
+  {
+    void *args[] = {&n, &d_chi, &tinny};
+    ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args);
+  }
+
+  if (ok)
+  {
+    cudaError_t err = cudaMemcpy(chi, d_chi, bytes, cudaMemcpyDeviceToHost);
+    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
+    ok = err == cudaSuccess;
+  }
+
+  free_device(d_chi);
+  return ok ? 0 : 1;
+}
diff --git a/AMSS_NCKU_source/bssn_cuda_ops.h b/AMSS_NCKU_source/bssn_cuda_ops.h
new file mode 100644
index 0000000..e45bcaa
--- /dev/null
+++ b/AMSS_NCKU_source/bssn_cuda_ops.h
@@ -0,0 +1,26 @@
+#ifndef BSSN_CUDA_OPS_H
+#define BSSN_CUDA_OPS_H
+
+int bssn_cuda_enforce_ga(int *ex,
+                         double *dxx, double *gxy, double *gxz,
+                         double *dyy, double *gyz, double *dzz,
+                         double *Axx, double *Axy, double *Axz,
+                         double *Ayy, double *Ayz, double *Azz);
+
+int bssn_cuda_rk4_boundary_var(int *ex, double dT,
+                               const double *X, const double *Y, const double *Z,
+                               double xmin, double ymin, double zmin,
+                               double xmax, double ymax, double zmax,
+                               const double *state0,
+                               const double *boundary_src,
+                               double *stage_data,
+                               double *rhs_accum,
+                               double propspeed,
+                               const double SoA[3],
+                               int symmetry,
+                               int lev,
+                               int rk_stage);
+
+int bssn_cuda_lowerbound(int *ex, double *chi, double tinny);
+
+#endif
diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C
new file mode 100644
index 0000000..e713a83
--- /dev/null
+++ b/AMSS_NCKU_source/bssn_cuda_step.C
@@ -0,0 +1,364 @@
+#include "macrodef.h"
+
+#ifdef USE_GPU
+
+#include <cmath>
+#include <vector>
+
+#include "bssn_class.h"
+#include "bssn_cuda_ops.h"
+#include "bssn_gpu.h"
+#include "bssn_macro.h"
+#include "rungekutta4_rout.h"
+
+void bssn_class::Step_MainPath_GPU(int lev, int YN)
+{
+#ifdef WithShell
+#error "Step_MainPath_GPU currently supports Patch grids only."
+#endif
+
+  setpbh(BH_num, Porg0, Mass, BH_num_input);
+
+  const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
+
+#if (MAPBH == 1)
+  if (BH_num > 0 && lev == GH->levels - 1)
+  {
+    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
+    for (int ithBH = 0; ithBH < BH_num; ithBH++)
+    {
+      for (int ith = 0; ith < 3; ith++)
+        Porg1[ithBH][ith] = Porg0[ithBH][ith] + Porg_rhs[ithBH][ith] * dT_lev;
+      if (Symmetry > 0)
+        Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
+      if (Symmetry == 2)
+      {
+        Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
+        Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
+      }
+    }
+  }
+
+  if (lev == a_lev)
+    AnalysisStuff(lev, dT_lev);
+#endif
+
+#ifdef With_AHF
+  AH_Step_Find(lev, dT_lev);
+#endif
+
+  const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
+  (void)BB;
+  double ndeps = (lev < GH->movls) ? numepsb : numepss;
+  double TRK4 = PhysTime;
+  int iter_count = 0;
+  int pre = 0, cor = 1;
+  int ERROR = 0;
+
+  auto run_stage_on_block =
+      [&](Block *cg, Patch *patch, MyList<var> *state0_list,
+          MyList<var> *boundary_src_list, MyList<var> *stage_data_list,
+          MyList<var> *rhs_list, int rk_stage) {
+        MyList<var> *varl0 = state0_list;
+        MyList<var> *varlb = boundary_src_list;
+        MyList<var> *varls = stage_data_list;
+        MyList<var> *varlr = rhs_list;
+
+        while (varl0)
+        {
+          if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev,
+                                         cg->X[0], cg->X[1], cg->X[2],
+                                         patch->bbox[0], patch->bbox[1], patch->bbox[2],
+                                         patch->bbox[3], patch->bbox[4], patch->bbox[5],
+                                         cg->fgfs[varl0->data->sgfn],
+                                         cg->fgfs[varlb->data->sgfn],
+                                         cg->fgfs[varls->data->sgfn],
+                                         cg->fgfs[varlr->data->sgfn],
+                                         varl0->data->propspeed,
+                                         varl0->data->SoA,
+                                         Symmetry, lev, rk_stage))
+          {
+            cerr << "GPU rk4/boundary failure: lev=" << lev
+                 << " rk_stage=" << rk_stage
+                 << " var=" << varl0->data->name
+                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+            ERROR = 1;
+            break;
+          }
+          varl0 = varl0->next;
+          varlb = varlb->next;
+          varls = varls->next;
+          varlr = varlr->next;
+        }
+      };
+
+  MyList<Patch> *Pp = GH->PatL[lev];
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank)
+      {
+#if (AGM == 0)
+        if (bssn_cuda_enforce_ga(cg->shape,
+                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
+                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
+                                 cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
+                                 cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]))
+        {
+          cerr << "GPU enforce_ga failure: lev=" << lev
+               << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
+               << cg->bbox[1] << ":" << cg->bbox[4] << ","
+               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+          ERROR = 1;
+        }
+#endif
+
+        if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
+          ERROR = 1;
+
+        run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
+
+        if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny))
+        {
+          cerr << "GPU lowerbound failure: lev=" << lev
+               << " rk_stage=" << iter_count
+               << " var=" << phi->name
+               << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
+               << cg->bbox[1] << ":" << cg->bbox[4] << ","
+               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+          ERROR = 1;
+        }
+      }
+      if (BP == Pp->data->ble)
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+
+  MPI_Request err_req_pre;
+  {
+    int erh = ERROR;
+    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
+  }
+
+  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
+  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
+
+  MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
+  if (ERROR)
+  {
+    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
+    if (myrank == 0)
+    {
+      if (ErrorMonitor->outfile)
+        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
+                              << ", lev = " << lev << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+#if (MAPBH == 0)
+  if (BH_num > 0 && lev == GH->levels - 1)
+  {
+    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
+    for (int ithBH = 0; ithBH < BH_num; ithBH++)
+    {
+      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
+      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
+      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
+      if (Symmetry > 0)
+        Porg[ithBH][2] = fabs(Porg[ithBH][2]);
+      if (Symmetry == 2)
+      {
+        Porg[ithBH][0] = fabs(Porg[ithBH][0]);
+        Porg[ithBH][1] = fabs(Porg[ithBH][1]);
+      }
+    }
+  }
+
+  if (lev == a_lev)
+    AnalysisStuff(lev, dT_lev);
+#endif
+
+  for (iter_count = 1; iter_count < 4; iter_count++)
+  {
+    if (iter_count == 1 || iter_count == 3)
+      TRK4 += dT_lev / 2;
+
+    Pp = GH->PatL[lev];
+    while (Pp)
+    {
+      MyList<Block> *BP = Pp->data->blb;
+      while (BP)
+      {
+        Block *cg = BP->data;
+        if (myrank == cg->rank)
+        {
+#if (AGM == 0)
+          if (bssn_cuda_enforce_ga(cg->shape,
+                                   cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
+                                   cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                                   cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
+                                   cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]))
+          {
+            cerr << "GPU enforce_ga failure: lev=" << lev
+                 << " rk_stage=" << iter_count
+                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+            ERROR = 1;
+          }
+#elif (AGM == 1)
+          if (iter_count == 3 &&
+              bssn_cuda_enforce_ga(cg->shape,
+                                   cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
+                                   cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                                   cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
+                                   cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]))
+          {
+            cerr << "GPU enforce_ga failure: lev=" << lev
+                 << " rk_stage=" << iter_count
+                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+            ERROR = 1;
+          }
+#endif
+
+          if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
+            ERROR = 1;
+
+          run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
+
+          if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny))
+          {
+            cerr << "GPU lowerbound failure: lev=" << lev
+                 << " rk_stage=" << iter_count
+                 << " var=" << phi1->name
+                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+            ERROR = 1;
+          }
+        }
+
+        if (BP == Pp->data->ble)
+          break;
+        BP = BP->next;
+      }
+      Pp = Pp->next;
+    }
+
+    MPI_Request err_req_cor;
+    {
+      int erh = ERROR;
+      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+    }
+
+    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
+    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
+
+    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
+    if (ERROR)
+    {
+      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
+      if (myrank == 0)
+      {
+        if (ErrorMonitor->outfile)
+          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
+                                << " variables at t = " << PhysTime
+                                << ", lev = " << lev << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+
+#if (MAPBH == 0)
+    if (BH_num > 0 && lev == GH->levels - 1)
+    {
+      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
+      for (int ithBH = 0; ithBH < BH_num; ithBH++)
+      {
+        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
+        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
+        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
+        if (Symmetry > 0)
+          Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
+        if (Symmetry == 2)
+        {
+          Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
+          Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
+        }
+      }
+    }
+#endif
+
+    if (iter_count < 3)
+    {
+      Pp = GH->PatL[lev];
+      while (Pp)
+      {
+        MyList<Block> *BP = Pp->data->blb;
+        while (BP)
+        {
+          BP->data->swapList(SynchList_pre, SynchList_cor, myrank);
+          if (BP == Pp->data->ble)
+            break;
+          BP = BP->next;
+        }
+        Pp = Pp->next;
+      }
+
+#if (MAPBH == 0)
+      if (BH_num > 0 && lev == GH->levels - 1)
+      {
+        for (int ithBH = 0; ithBH < BH_num; ithBH++)
+        {
+          Porg[ithBH][0] = Porg1[ithBH][0];
+          Porg[ithBH][1] = Porg1[ithBH][1];
+          Porg[ithBH][2] = Porg1[ithBH][2];
+        }
+      }
+#endif
+    }
+  }
+
+#if (RPS == 0)
+  RestrictProlong(lev, YN, BB);
+#endif
+
+  Pp = GH->PatL[lev];
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      cg->swapList(StateList, SynchList_cor, myrank);
+      cg->swapList(OldStateList, SynchList_cor, myrank);
+      if (BP == Pp->data->ble)
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+
+  if (BH_num > 0 && lev == GH->levels - 1)
+  {
+    for (int ithBH = 0; ithBH < BH_num; ithBH++)
+    {
+      Porg0[ithBH][0] = Porg1[ithBH][0];
+      Porg0[ithBH][1] = Porg1[ithBH][1];
+      Porg0[ithBH][2] = Porg1[ithBH][2];
+    }
+  }
+}
+
+#endif
diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu
index e67ae18..bf6920c 100644
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -18,12 +18,18 @@ using namespace std;
 #include <fstream>
 #endif
 
-void compare_result_gpu(int ftag1,double * datac,int data_num){
-	double * data = (double*)malloc(sizeof(double)*data_num);
-	cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
-	compare_result(ftag1,data,data_num);
-	free(data);
-}
+void compare_result_gpu(int ftag1,double * datac,int data_num){
+#ifdef RESULT_CHECK
+	double * data = (double*)malloc(sizeof(double)*data_num);
+	cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
+	compare_result(ftag1,data,data_num);
+	free(data);
+#else
+	(void)ftag1;
+	(void)datac;
+	(void)data_num;
+#endif
+}
 
 __global__ void test_const_address(double * testd){
 	int _t = blockIdx.x*blockDim.x+threadIdx.x;
diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile
index 72b9cbd..14bfb32 100644
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -105,13 +105,12 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
            Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
 	   NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
 	   
-C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
-           cgh.o surface_integral.o ShellPatch.o\
-	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
-	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
-           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
-	   NullShellPatch2_Evo.o \
-	   bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
+C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
+           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
+	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
+	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
+           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
+	   NullShellPatch2_Evo.o bssn_cuda_step.o writefile_f.o
 
 F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   prolongrestrict_cell.o prolongrestrict_vertex.o\
@@ -143,7 +142,7 @@ initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o
 
 TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o 
 
-CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
+CUDAFILES = bssn_gpu.o bssn_cuda_ops.o
 
 # file dependences
 $(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc
index 331cff1..09dfe1d 100755
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -9,6 +9,7 @@ filein  = -I/usr/include/ -I${MKLROOT}/include
 ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
 LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
+CUDA_LDLIBS = -L/usr/local/cuda-12.9/targets/x86_64-linux/lib -lcudart
 
 ## Memory allocator switch
 ##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
@@ -24,6 +25,8 @@ ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
 
+LDLIBS := $(CUDA_LDLIBS) $(LDLIBS)
+
 ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
 ##   opt        : (default) maximum performance with PGO profile-guided optimization
 ##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
diff --git a/makefile_and_run.py b/makefile_and_run.py
index 5682476..ac86036 100755
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -9,6 +9,7 @@
 
 
 import AMSS_NCKU_Input as input_data
+import os
 import subprocess
 import time
 
@@ -57,6 +58,48 @@ BUILD_JOBS = 64
 ##################################################################
 
 
+##################################################################
+
+def prepare_gpu_runtime_env():
+    """
+    Create a user-private CUDA MPS environment for GPU runs.
+
+    On shared machines another user's daemon may already occupy the default
+    /tmp/nvidia-mps pipe directory, which makes plain cudaSetDevice/cudaMalloc
+    fail with cudaErrorMpsConnectionFailed.  Binding AMSS-NCKU to a private
+    pipe directory avoids cross-user interference.
+    """
+    env = os.environ.copy()
+
+    pipe_dir = env.get("CUDA_MPS_PIPE_DIRECTORY", f"/tmp/amss-ncku-mps-{os.getuid()}")
+    log_dir  = env.get("CUDA_MPS_LOG_DIRECTORY",  f"/tmp/amss-ncku-mps-log-{os.getuid()}")
+
+    os.makedirs(pipe_dir, exist_ok=True)
+    os.makedirs(log_dir, exist_ok=True)
+
+    env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
+    env["CUDA_MPS_LOG_DIRECTORY"] = log_dir
+
+    control_socket = os.path.join(pipe_dir, "control")
+    if not os.path.exists(control_socket):
+        start = subprocess.run(
+            ["nvidia-cuda-mps-control", "-d"],
+            env=env,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+        if start.returncode != 0:
+            print(f" Warning: failed to start private CUDA MPS daemon in {pipe_dir}")
+        else:
+            print(f" Using private CUDA MPS pipe directory: {pipe_dir}")
+    else:
+        print(f" Using existing private CUDA MPS pipe directory: {pipe_dir}")
+
+    return env
+
+##################################################################
+
+
 
 ##################################################################
 
@@ -146,16 +189,29 @@ def run_ABE():
 
     ## Define the command to run; cast other values to strings as needed
     
+    run_env = None
+
     if (input_data.GPU_Calculation == "no"):
         mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
         #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
         mpi_command_outfile = "ABE_out.log"
     elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        run_env = prepare_gpu_runtime_env()
+        if int(input_data.MPI_processes) == 1:
+            mpi_command = "./ABEGPU"
+        else:
+            mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
         mpi_command_outfile = "ABEGPU_out.log"
  
     ## Execute the MPI command and stream output
-    mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+    mpi_process = subprocess.Popen(
+        mpi_command,
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        env=run_env,
+    )
 
     ## Write ABE run output to file while printing to stdout
     with open(mpi_command_outfile, 'w') as file0: