Integrate CUDA support into RK4 substep execution

2026-04-12 22:11:44 +08:00
parent 86a683de26
commit 4fa12a2009
4 changed files with 1161 additions and 279 deletions
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -21,9 +21,12 @@ using namespace std;
 #include "Ansorg.h"
 #include "fmisc.h"
 #include "Parallel.h"
-#include "bssn_class.h"
-#include "bssn_rhs.h"
-#include "initial_puncture.h"
+#include "bssn_class.h"
+#include "bssn_rhs.h"
+#if USE_CUDA_BSSN
+#include "bssn_rhs_cuda.h"
+#endif
+#include "initial_puncture.h"
 #include "enforce_algebra.h"
 #include "rungekutta4_rout.h"
 #include "sommerfeld_rout.h"
@@ -47,6 +50,35 @@ using namespace std;
 #define BSSN_ENABLE_MEM_USAGE_LOG 0
 #endif

+#if USE_CUDA_BSSN
+namespace {
+
+bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
+                          double **host_views,
+                          double *propspeeds = nullptr,
+                          double *soa_flat = nullptr)
+{
+  int idx = 0;
+  while (vars && idx < BSSN_CUDA_STATE_COUNT)
+  {
+    host_views[idx] = cg->fgfs[vars->data->sgfn];
+    if (propspeeds)
+      propspeeds[idx] = vars->data->propspeed;
+    if (soa_flat)
+    {
+      soa_flat[3 * idx + 0] = vars->data->SoA[0];
+      soa_flat[3 * idx + 1] = vars->data->SoA[1];
+      soa_flat[3 * idx + 2] = vars->data->SoA[2];
+    }
+    vars = vars->next;
+    ++idx;
+  }
+  return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
+}
+
+} // namespace
+#endif
+
 //================================================================================================

 // define bssn_class
@@ -3104,104 +3136,148 @@ void bssn_class::Step(int lev, int YN)
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif

-        if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
-                               cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
-                               cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
-                               cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
-                               cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
-                               cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
-                               cg->fgfs[Lap0->sgfn], 
-                               cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
-                               cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
-                               cg->fgfs[phi_rhs->sgfn], cg->fgfs[trK_rhs->sgfn],
-                               cg->fgfs[gxx_rhs->sgfn], cg->fgfs[gxy_rhs->sgfn], cg->fgfs[gxz_rhs->sgfn],
-                               cg->fgfs[gyy_rhs->sgfn], cg->fgfs[gyz_rhs->sgfn], cg->fgfs[gzz_rhs->sgfn],
-                               cg->fgfs[Axx_rhs->sgfn], cg->fgfs[Axy_rhs->sgfn], cg->fgfs[Axz_rhs->sgfn],
-                               cg->fgfs[Ayy_rhs->sgfn], cg->fgfs[Ayz_rhs->sgfn], cg->fgfs[Azz_rhs->sgfn],
-                               cg->fgfs[Gmx_rhs->sgfn], cg->fgfs[Gmy_rhs->sgfn], cg->fgfs[Gmz_rhs->sgfn],
-                               cg->fgfs[Lap_rhs->sgfn], 
-                               cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn],
-                               cg->fgfs[dtSfx_rhs->sgfn], cg->fgfs[dtSfy_rhs->sgfn], cg->fgfs[dtSfz_rhs->sgfn],
-                               cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
-                               cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
-                               cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
-                               cg->fgfs[Gamxxx->sgfn], cg->fgfs[Gamxxy->sgfn], cg->fgfs[Gamxxz->sgfn],
-                               cg->fgfs[Gamxyy->sgfn], cg->fgfs[Gamxyz->sgfn], cg->fgfs[Gamxzz->sgfn],
-                               cg->fgfs[Gamyxx->sgfn], cg->fgfs[Gamyxy->sgfn], cg->fgfs[Gamyxz->sgfn],
-                               cg->fgfs[Gamyyy->sgfn], cg->fgfs[Gamyyz->sgfn], cg->fgfs[Gamyzz->sgfn],
-                               cg->fgfs[Gamzxx->sgfn], cg->fgfs[Gamzxy->sgfn], cg->fgfs[Gamzxz->sgfn],
-                               cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
-                               cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], 
-                               cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
-                               cg->fgfs[Cons_Ham->sgfn],
-                               cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
-                               cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                               Symmetry, lev, ndeps, pre))
-        {
-          cout << "find NaN in domain: (" 
-               << cg->bbox[0] << ":" << cg->bbox[3] << "," 
-               << cg->bbox[1] << ":" << cg->bbox[4] << ","
-               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-          ERROR = 1;
-        }
-
-        // rk4 substep and boundary
-        {
-          MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here
-          while (varl0)
-          {
-#if (SommerType == 0)
-#ifndef WithShell
-            if (lev == 0) // sommerfeld indeed
-              f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                                   Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
-                                   Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
-                                   cg->fgfs[varlrhs->data->sgfn],
-                                   cg->fgfs[varl0->data->sgfn], 
-                                   varl0->data->propspeed, varl0->data->SoA,
-                                   Symmetry);
-
-#endif
-#endif
-            f_rungekutta4_rout(cg->shape, dT_lev, 
-                               cg->fgfs[varl0->data->sgfn], 
-                               cg->fgfs[varl->data->sgfn], 
-                               cg->fgfs[varlrhs->data->sgfn],
-                               iter_count);
-#ifndef WithShell
-            if (lev > 0) // fix BD point
-#endif
-              f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                                Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
-                                Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
-                                dT_lev, 
-                                cg->fgfs[phi0->sgfn],
-                                cg->fgfs[Lap0->sgfn], 
-                                cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], 
-                                varl0->data->SoA,
-                                Symmetry, cor);
-
-#if (SommerType == 1)
-#warning "shell part still bam type"
-            if (lev == 0) // Shibata type sommerfeld
-              f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                                Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
-                                Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
-                                dT_lev, 
-                                cg->fgfs[phi0->sgfn],
-                                cg->fgfs[Lap0->sgfn], 
-                                cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], 
-                                varl0->data->SoA,
-                                Symmetry, pre);
-#endif
-
-            varl0 = varl0->next;
-            varl = varl->next;
-            varlrhs = varlrhs->next;
-          }
-        }
-        f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
+        bool used_gpu_substep = false;
+#if USE_CUDA_BSSN
+        {
+          double *state_in[BSSN_CUDA_STATE_COUNT];
+          double *state_out[BSSN_CUDA_STATE_COUNT];
+          double *matter[BSSN_CUDA_MATTER_COUNT] = {
+              cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
+              cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
+              cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn]};
+          double propspeed[BSSN_CUDA_STATE_COUNT];
+          double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
+          if (!fill_bssn_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
+              !fill_bssn_cuda_views(cg, SynchList_pre, state_out))
+          {
+            cout << "CUDA BSSN state list mismatch on predictor step" << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          int apply_bam_bc = 0;
+#if (SommerType == 0)
+#ifndef WithShell
+          apply_bam_bc = (lev == 0) ? 1 : 0;
+#endif
+#endif
+          if (bssn_cuda_rk4_substep(cg,
+                                    cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                    state_in, state_out, matter,
+                                    propspeed, soa_flat, Pp->data->bbox,
+                                    dT_lev, TRK4, iter_count, apply_bam_bc,
+                                    Symmetry, lev, ndeps, pre))
+          {
+            cout << "CUDA predictor substep failed in domain: ("
+                 << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+            ERROR = 1;
+          }
+          used_gpu_substep = true;
+        }
+#endif
+        if (!used_gpu_substep)
+        {
+          if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
+                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
+                                 cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
+                                 cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
+                                 cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
+                                 cg->fgfs[Lap0->sgfn], 
+                                 cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
+                                 cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
+                                 cg->fgfs[phi_rhs->sgfn], cg->fgfs[trK_rhs->sgfn],
+                                 cg->fgfs[gxx_rhs->sgfn], cg->fgfs[gxy_rhs->sgfn], cg->fgfs[gxz_rhs->sgfn],
+                                 cg->fgfs[gyy_rhs->sgfn], cg->fgfs[gyz_rhs->sgfn], cg->fgfs[gzz_rhs->sgfn],
+                                 cg->fgfs[Axx_rhs->sgfn], cg->fgfs[Axy_rhs->sgfn], cg->fgfs[Axz_rhs->sgfn],
+                                 cg->fgfs[Ayy_rhs->sgfn], cg->fgfs[Ayz_rhs->sgfn], cg->fgfs[Azz_rhs->sgfn],
+                                 cg->fgfs[Gmx_rhs->sgfn], cg->fgfs[Gmy_rhs->sgfn], cg->fgfs[Gmz_rhs->sgfn],
+                                 cg->fgfs[Lap_rhs->sgfn], 
+                                 cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn],
+                                 cg->fgfs[dtSfx_rhs->sgfn], cg->fgfs[dtSfy_rhs->sgfn], cg->fgfs[dtSfz_rhs->sgfn],
+                                 cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
+                                 cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
+                                 cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
+                                 cg->fgfs[Gamxxx->sgfn], cg->fgfs[Gamxxy->sgfn], cg->fgfs[Gamxxz->sgfn],
+                                 cg->fgfs[Gamxyy->sgfn], cg->fgfs[Gamxyz->sgfn], cg->fgfs[Gamxzz->sgfn],
+                                 cg->fgfs[Gamyxx->sgfn], cg->fgfs[Gamyxy->sgfn], cg->fgfs[Gamyxz->sgfn],
+                                 cg->fgfs[Gamyyy->sgfn], cg->fgfs[Gamyyz->sgfn], cg->fgfs[Gamyzz->sgfn],
+                                 cg->fgfs[Gamzxx->sgfn], cg->fgfs[Gamzxy->sgfn], cg->fgfs[Gamzxz->sgfn],
+                                 cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
+                                 cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], 
+                                 cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
+                                 cg->fgfs[Cons_Ham->sgfn],
+                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
+                                 Symmetry, lev, ndeps, pre))
+          {
+            cout << "find NaN in domain: (" 
+                 << cg->bbox[0] << ":" << cg->bbox[3] << "," 
+                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+            ERROR = 1;
+          }
+        }
+
+        // rk4 substep boundary fix
+        {
+          MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here
+          while (varl0)
+          {
+#if !USE_CUDA_BSSN
+#if (SommerType == 0)
+#ifndef WithShell
+            if (lev == 0) // sommerfeld indeed
+              f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                   Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
+                                   Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
+                                   cg->fgfs[varlrhs->data->sgfn],
+                                   cg->fgfs[varl0->data->sgfn], 
+                                   varl0->data->propspeed, varl0->data->SoA,
+                                   Symmetry);
+
+#endif
+#endif
+            f_rungekutta4_rout(cg->shape, dT_lev, 
+                               cg->fgfs[varl0->data->sgfn], 
+                               cg->fgfs[varl->data->sgfn], 
+                               cg->fgfs[varlrhs->data->sgfn],
+                               iter_count);
+#endif
+#ifndef WithShell
+            if (lev > 0) // fix BD point
+#endif
+              f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
+                                Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
+                                dT_lev, 
+                                cg->fgfs[phi0->sgfn],
+                                cg->fgfs[Lap0->sgfn], 
+                                cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], 
+                                varl0->data->SoA,
+                                Symmetry, cor);
+
+#if (SommerType == 1)
+#warning "shell part still bam type"
+            if (lev == 0) // Shibata type sommerfeld
+              f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
+                                Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
+                                dT_lev, 
+                                cg->fgfs[phi0->sgfn],
+                                cg->fgfs[Lap0->sgfn], 
+                                cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], 
+                                varl0->data->SoA,
+                                Symmetry, pre);
+#endif
+
+            varl0 = varl0->next;
+            varl = varl->next;
+            varlrhs = varlrhs->next;
+          }
+        }
+        f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
      }
      if (BP == Pp->data->ble)
        break;
@@ -3469,102 +3545,148 @@ void bssn_class::Step(int lev, int YN)
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif

-          if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                                 cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
-                                 cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
-                                 cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                                 cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
-                                 cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
-                                 cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
-                                 cg->fgfs[Lap->sgfn], 
-                                 cg->fgfs[Sfx->sgfn], cg->fgfs[Sfy->sgfn], cg->fgfs[Sfz->sgfn],
-                                 cg->fgfs[dtSfx->sgfn], cg->fgfs[dtSfy->sgfn], cg->fgfs[dtSfz->sgfn],
-                                 cg->fgfs[phi1->sgfn], cg->fgfs[trK1->sgfn],
-                                 cg->fgfs[gxx1->sgfn], cg->fgfs[gxy1->sgfn], cg->fgfs[gxz1->sgfn],
-                                 cg->fgfs[gyy1->sgfn], cg->fgfs[gyz1->sgfn], cg->fgfs[gzz1->sgfn],
-                                 cg->fgfs[Axx1->sgfn], cg->fgfs[Axy1->sgfn], cg->fgfs[Axz1->sgfn],
-                                 cg->fgfs[Ayy1->sgfn], cg->fgfs[Ayz1->sgfn], cg->fgfs[Azz1->sgfn],
-                                 cg->fgfs[Gmx1->sgfn], cg->fgfs[Gmy1->sgfn], cg->fgfs[Gmz1->sgfn],
-                                 cg->fgfs[Lap1->sgfn], 
-                                 cg->fgfs[Sfx1->sgfn], cg->fgfs[Sfy1->sgfn], cg->fgfs[Sfz1->sgfn],
-                                 cg->fgfs[dtSfx1->sgfn], cg->fgfs[dtSfy1->sgfn], cg->fgfs[dtSfz1->sgfn],
-                                 cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
-                                 cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
-                                 cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
-                                 cg->fgfs[Gamxxx->sgfn], cg->fgfs[Gamxxy->sgfn], cg->fgfs[Gamxxz->sgfn],
-                                 cg->fgfs[Gamxyy->sgfn], cg->fgfs[Gamxyz->sgfn], cg->fgfs[Gamxzz->sgfn],
-                                 cg->fgfs[Gamyxx->sgfn], cg->fgfs[Gamyxy->sgfn], cg->fgfs[Gamyxz->sgfn],
-                                 cg->fgfs[Gamyyy->sgfn], cg->fgfs[Gamyyz->sgfn], cg->fgfs[Gamyzz->sgfn],
-                                 cg->fgfs[Gamzxx->sgfn], cg->fgfs[Gamzxy->sgfn], cg->fgfs[Gamzxz->sgfn],
-                                 cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
-                                 cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], 
-                                 cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
-                                 cg->fgfs[Cons_Ham->sgfn],
-                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
-                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                                 Symmetry, lev, ndeps, cor))
-          {
-            cout << "find NaN in domain: (" 
-                 << cg->bbox[0] << ":" << cg->bbox[3] << "," 
-                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-            ERROR = 1;
-          }
-          // rk4 substep and boundary
-          {
-            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; // we do not check the correspondence here
-            while (varl0)
-            {
-#if (SommerType == 0)
-#ifndef WithShell
-              if (lev == 0) // sommerfeld indeed
-                f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                                     Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
-                                     Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
-                                     cg->fgfs[varl1->data->sgfn],
-                                     cg->fgfs[varl->data->sgfn], varl0->data->propspeed, varl0->data->SoA,
-                                     Symmetry);
-#endif
-#endif
-              f_rungekutta4_rout(cg->shape, dT_lev, 
-                                 cg->fgfs[varl0->data->sgfn], 
-                                 cg->fgfs[varl1->data->sgfn], 
-                                 cg->fgfs[varlrhs->data->sgfn],
-                                 iter_count);
-
-#ifndef WithShell
-              if (lev > 0) // fix BD point
-#endif
-                f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                                  Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
-                                  Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
-                                  dT_lev, 
-                                  cg->fgfs[phi0->sgfn],
-                                  cg->fgfs[Lap0->sgfn], 
-                                  cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn], 
-                                  varl0->data->SoA,
-                                  Symmetry, cor);
-
-#if (SommerType == 1)
-              if (lev == 1) // shibata type sommerfeld
-                f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                                  Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
-                                  Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
-                                  dT_lev, 
-                                  cg->fgfs[phi0->sgfn],
-                                  cg->fgfs[Lap0->sgfn], 
-                                  cg->fgfs[varl->data->sgfn], cg->fgfs[varl1->data->sgfn], 
-                                  varl0->data->SoA,
-                                  Symmetry, cor);
-#endif
-
-              varl0 = varl0->next;
-              varl = varl->next;
-              varl1 = varl1->next;
-              varlrhs = varlrhs->next;
-            }
-          }
-          f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
+          bool used_gpu_substep = false;
+#if USE_CUDA_BSSN
+          {
+            double *state_in[BSSN_CUDA_STATE_COUNT];
+            double *state_out[BSSN_CUDA_STATE_COUNT];
+            double *matter[BSSN_CUDA_MATTER_COUNT] = {
+                cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
+                cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
+                cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn]};
+            double propspeed[BSSN_CUDA_STATE_COUNT];
+            double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
+            if (!fill_bssn_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
+                !fill_bssn_cuda_views(cg, SynchList_cor, state_out))
+            {
+              cout << "CUDA BSSN state list mismatch on corrector step" << endl;
+              MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+            int apply_bam_bc = 0;
+#if (SommerType == 0)
+#ifndef WithShell
+            apply_bam_bc = (lev == 0) ? 1 : 0;
+#endif
+#endif
+            if (bssn_cuda_rk4_substep(cg,
+                                      cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                      state_in, state_out, matter,
+                                      propspeed, soa_flat, Pp->data->bbox,
+                                      dT_lev, TRK4, iter_count, apply_bam_bc,
+                                      Symmetry, lev, ndeps, cor))
+            {
+              cout << "CUDA corrector substep failed in domain: ("
+                   << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+              ERROR = 1;
+            }
+            used_gpu_substep = true;
+          }
+#endif
+          if (!used_gpu_substep)
+          {
+            if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+                                   cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
+                                   cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
+                                   cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                                   cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
+                                   cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
+                                   cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
+                                   cg->fgfs[Lap->sgfn], 
+                                   cg->fgfs[Sfx->sgfn], cg->fgfs[Sfy->sgfn], cg->fgfs[Sfz->sgfn],
+                                   cg->fgfs[dtSfx->sgfn], cg->fgfs[dtSfy->sgfn], cg->fgfs[dtSfz->sgfn],
+                                   cg->fgfs[phi1->sgfn], cg->fgfs[trK1->sgfn],
+                                   cg->fgfs[gxx1->sgfn], cg->fgfs[gxy1->sgfn], cg->fgfs[gxz1->sgfn],
+                                   cg->fgfs[gyy1->sgfn], cg->fgfs[gyz1->sgfn], cg->fgfs[gzz1->sgfn],
+                                   cg->fgfs[Axx1->sgfn], cg->fgfs[Axy1->sgfn], cg->fgfs[Axz1->sgfn],
+                                   cg->fgfs[Ayy1->sgfn], cg->fgfs[Ayz1->sgfn], cg->fgfs[Azz1->sgfn],
+                                   cg->fgfs[Gmx1->sgfn], cg->fgfs[Gmy1->sgfn], cg->fgfs[Gmz1->sgfn],
+                                   cg->fgfs[Lap1->sgfn], 
+                                   cg->fgfs[Sfx1->sgfn], cg->fgfs[Sfy1->sgfn], cg->fgfs[Sfz1->sgfn],
+                                   cg->fgfs[dtSfx1->sgfn], cg->fgfs[dtSfy1->sgfn], cg->fgfs[dtSfz1->sgfn],
+                                   cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
+                                   cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
+                                   cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
+                                   cg->fgfs[Gamxxx->sgfn], cg->fgfs[Gamxxy->sgfn], cg->fgfs[Gamxxz->sgfn],
+                                   cg->fgfs[Gamxyy->sgfn], cg->fgfs[Gamxyz->sgfn], cg->fgfs[Gamxzz->sgfn],
+                                   cg->fgfs[Gamyxx->sgfn], cg->fgfs[Gamyxy->sgfn], cg->fgfs[Gamyxz->sgfn],
+                                   cg->fgfs[Gamyyy->sgfn], cg->fgfs[Gamyyz->sgfn], cg->fgfs[Gamyzz->sgfn],
+                                   cg->fgfs[Gamzxx->sgfn], cg->fgfs[Gamzxy->sgfn], cg->fgfs[Gamzxz->sgfn],
+                                   cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
+                                   cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], 
+                                   cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
+                                   cg->fgfs[Cons_Ham->sgfn],
+                                   cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+                                   cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
+                                   Symmetry, lev, ndeps, cor))
+            {
+              cout << "find NaN in domain: (" 
+                   << cg->bbox[0] << ":" << cg->bbox[3] << "," 
+                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+              ERROR = 1;
+            }
+          }
+          // rk4 substep boundary fix
+          {
+            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; 
+            // we do not check the correspondence here
+            while (varl0)
+            {
+#if !USE_CUDA_BSSN
+#if (SommerType == 0)
+#ifndef WithShell
+              if (lev == 0) // sommerfeld indeed
+                f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                     Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
+                                     Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
+                                     cg->fgfs[varl1->data->sgfn],
+                                     cg->fgfs[varl->data->sgfn], 
+                                     varl0->data->propspeed, varl0->data->SoA,
+                                     Symmetry);
+#endif
+#endif
+              f_rungekutta4_rout(cg->shape, dT_lev, 
+                                 cg->fgfs[varl0->data->sgfn], 
+                                 cg->fgfs[varl1->data->sgfn], 
+                                 cg->fgfs[varlrhs->data->sgfn],
+                                 iter_count);
+#endif
+
+#ifndef WithShell
+              if (lev > 0) // fix BD point
+#endif
+                f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                  Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
+                                  Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
+                                  dT_lev, 
+                                  cg->fgfs[phi0->sgfn],
+                                  cg->fgfs[Lap0->sgfn], 
+                                  cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn], 
+                                  varl0->data->SoA,
+                                  Symmetry, cor);
+
+#if (SommerType == 1)
+              if (lev == 1) // shibata type sommerfeld
+                f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                  Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], 
+                                  Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
+                                  dT_lev, 
+                                  cg->fgfs[phi0->sgfn],
+                                  cg->fgfs[Lap0->sgfn], 
+                                  cg->fgfs[varl->data->sgfn], cg->fgfs[varl1->data->sgfn], 
+                                  varl0->data->SoA,
+                                  Symmetry, cor);
+#endif
+
+              varl0 = varl0->next;
+              varl = varl->next;
+              varl1 = varl1->next;
+              varlrhs = varlrhs->next;
+            }
+          }
+          f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
        }
        if (BP == Pp->data->ble)
          break;
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -5,10 +5,12 @@
 * Compile with nvcc, link bssn_rhs_cuda.o in place of bssn_rhs_c.o.
 */

+#include <array>
 #include <cstdio>
 #include <cstdlib>
 #include <cmath>
 #include <cstring>
+#include <unordered_map>
 #include <cuda_runtime.h>
 #include "macrodef.h"
 #include "bssn_rhs.h"
@@ -219,6 +221,40 @@ static const int STAGE_SLOT_COUNT =
        ? H2D_INPUT_SLOT_COUNT
        : (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT);

+static constexpr int BSSN_STATE_COUNT = 24;
+static constexpr int BSSN_MATTER_COUNT = 10;
+
+static const int k_state_input_slots[BSSN_STATE_COUNT] = {
+    S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
+    S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz,
+    S_Gamx, S_Gamy, S_Gamz,
+    S_Lap, S_betax, S_betay, S_betaz,
+    S_dtSfx, S_dtSfy, S_dtSfz
+};
+
+static const int k_state_rhs_slots[BSSN_STATE_COUNT] = {
+    S_chi_rhs, S_trK_rhs,
+    S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs,
+    S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs,
+    S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs,
+    S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs,
+    S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs
+};
+
+static const int k_matter_slots[BSSN_MATTER_COUNT] = {
+    S_rho, S_Sx, S_Sy, S_Sz, S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz
+};
+
+struct StepContext {
+    double *d_state0_mem;
+    double *d_accum_mem;
+    std::array<double *, BSSN_STATE_COUNT> d_state0;
+    std::array<double *, BSSN_STATE_COUNT> d_accum;
+    size_t cap_all;
+};
+
+static std::unordered_map<void *, StepContext> g_step_ctx;
+
 static void ensure_gpu_buffers(int nx, int ny, int nz) {
    size_t all = (size_t)nx * ny * nz;
    size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2);
@@ -269,6 +305,38 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
    g_buf.prev_ny = ny;
    g_buf.prev_nz = nz;
 }
+
+static StepContext &ensure_step_ctx(void *block_tag, size_t all)
+{
+    StepContext &ctx = g_step_ctx[block_tag];
+    if (ctx.cap_all < all) {
+        if (ctx.d_state0_mem) {
+            cudaFree(ctx.d_state0_mem);
+            ctx.d_state0_mem = nullptr;
+        }
+        if (ctx.d_accum_mem) {
+            cudaFree(ctx.d_accum_mem);
+            ctx.d_accum_mem = nullptr;
+        }
+        CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
+        CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
+        ctx.cap_all = all;
+    }
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
+        ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
+    }
+    return ctx;
+}
+
+static void release_step_ctx(void *block_tag)
+{
+    auto it = g_step_ctx.find(block_tag);
+    if (it == g_step_ctx.end()) return;
+    if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
+    if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
+    g_step_ctx.erase(it);
+}

 /* ================================================================== */
 /*  A. Symmetry boundary kernels (ord=2 and ord=3)                     */
@@ -937,6 +1005,177 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
        kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
    }
 }
+
+__global__ void kern_rk4_finalize(const double * __restrict__ f0,
+                                  double * __restrict__ frhs,
+                                  double * __restrict__ accum,
+                                  double dT,
+                                  int rk4_stage)
+{
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+         i < d_gp.all;
+         i += blockDim.x * gridDim.x)
+    {
+        const double rhs = frhs[i];
+        switch (rk4_stage) {
+        case 0:
+            accum[i] = rhs;
+            frhs[i] = f0[i] + 0.5 * dT * rhs;
+            break;
+        case 1:
+            accum[i] += 2.0 * rhs;
+            frhs[i] = f0[i] + 0.5 * dT * rhs;
+            break;
+        case 2:
+            accum[i] += 2.0 * rhs;
+            frhs[i] = f0[i] + dT * rhs;
+            break;
+        default:
+            frhs[i] = f0[i] + (dT / 6.0) * (accum[i] + rhs);
+            break;
+        }
+    }
+}
+
+__global__ void kern_lowerboundset_cuda(double * __restrict__ chi, double tinny)
+{
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+         i < d_gp.all;
+         i += blockDim.x * gridDim.x)
+    {
+        if (chi[i] < tinny) chi[i] = tinny;
+    }
+}
+
+enum SommerFace {
+    FACE_XMAX = 0,
+    FACE_YMAX = 1,
+    FACE_ZMAX = 2,
+    FACE_XMIN = 3,
+    FACE_YMIN = 4,
+    FACE_ZMIN = 5
+};
+
+__global__ void kern_sommerfeld_face_bam(const double * __restrict__ fh,
+                                         double * __restrict__ f_rhs,
+                                         int face,
+                                         double velocity,
+                                         double x0,
+                                         double y0,
+                                         double z0)
+{
+    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
+    const int imin = (d_gp.Symmetry > 1 && fabs(x0) < d_gp.dX) ? 0 : 1;
+    const int jmin = (d_gp.Symmetry > 1 && fabs(y0) < d_gp.dY) ? 0 : 1;
+    const int kmin = (d_gp.Symmetry > 0 && fabs(z0) < d_gp.dZ) ? 0 : 1;
+    const int imax = nx, jmax = ny, kmax = nz;
+    const int plane_count =
+        (face == FACE_XMAX || face == FACE_XMIN) ? ny * nz :
+        (face == FACE_YMAX || face == FACE_YMIN) ? nx * nz :
+                                                   nx * ny;
+
+    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
+         tid < plane_count;
+         tid += blockDim.x * gridDim.x)
+    {
+        int i0 = 0, j0 = 0, k0 = 0;
+        if (face == FACE_XMAX || face == FACE_XMIN) {
+            j0 = tid % ny;
+            k0 = tid / ny;
+            i0 = (face == FACE_XMAX) ? (nx - 1) : 0;
+        } else if (face == FACE_YMAX || face == FACE_YMIN) {
+            i0 = tid % nx;
+            k0 = tid / nx;
+            j0 = (face == FACE_YMAX) ? (ny - 1) : 0;
+        } else {
+            i0 = tid % nx;
+            j0 = tid / nx;
+            k0 = (face == FACE_ZMAX) ? (nz - 1) : 0;
+        }
+
+        const int iF = i0 + 1;
+        const int jF = j0 + 1;
+        const int kF = k0 + 1;
+        const int p = idx_ex_d(i0, j0, k0);
+
+        const double x = x0 + i0 * d_gp.dX;
+        const double y = y0 + j0 * d_gp.dY;
+        const double z = z0 + k0 * d_gp.dZ;
+        const double r = sqrt(x * x + y * y + z * z);
+        if (r == 0.0) continue;
+
+        const double wx = velocity * x / r;
+        const double wy = velocity * y / r;
+        const double wz = velocity * z / r;
+
+        double fx = 0.0, fy = 0.0, fz = 0.0;
+        if (wx > 0.0) {
+            if (iF - 2 >= imin) fx = d_gp.d2dx * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF - 2, jF, kF)]);
+            else if (iF - 1 >= imin) fx = d_gp.d2dx * (-fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF + 1, jF, kF)]);
+            else fx = d_gp.d2dx * (-fh[idx_fh2(iF + 2, jF, kF)] + 4.0 * fh[idx_fh2(iF + 1, jF, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
+        } else if (wx < 0.0) {
+            if (iF + 2 <= imax) fx = d_gp.d2dx * (-fh[idx_fh2(iF + 2, jF, kF)] + 4.0 * fh[idx_fh2(iF + 1, jF, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
+            else if (iF + 1 <= imax) fx = d_gp.d2dx * (-fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF + 1, jF, kF)]);
+            else fx = d_gp.d2dx * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF - 2, jF, kF)]);
+        }
+
+        if (wy > 0.0) {
+            if (jF - 2 >= jmin) fy = d_gp.d2dy * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF - 2, kF)]);
+            else if (jF - 1 >= jmin) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF + 1, kF)]);
+            else fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF + 2, kF)] + 4.0 * fh[idx_fh2(iF, jF + 1, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
+        } else if (wy < 0.0) {
+            if (jF + 2 <= jmax) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF + 2, kF)] + 4.0 * fh[idx_fh2(iF, jF + 1, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
+            else if (jF + 1 <= jmax) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF + 1, kF)]);
+            else fy = d_gp.d2dy * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF - 2, kF)]);
+        }
+
+        if (wz > 0.0) {
+            if (kF - 2 >= kmin) fz = d_gp.d2dz * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF - 2)]);
+            else if (kF - 1 >= kmin) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF + 1)]);
+            else fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF + 2)] + 4.0 * fh[idx_fh2(iF, jF, kF + 1)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
+        } else if (wz < 0.0) {
+            if (kF + 2 <= kmax) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF + 2)] + 4.0 * fh[idx_fh2(iF, jF, kF + 1)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
+            else if (kF + 1 <= kmax) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF + 1)]);
+            else fz = d_gp.d2dz * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF - 2)]);
+        }
+
+        f_rhs[p] = -velocity * (fx * x + fy * y + fz * z + fh[idx_fh2(iF, jF, kF)]) / r;
+    }
+}
+
+static void gpu_sommerfeld_routbam(double *d_f0, double *d_f_rhs,
+                                   double velocity,
+                                   double SoA0, double SoA1, double SoA2,
+                                   double *X, double *Y, double *Z,
+                                   const double *bbox,
+                                   int Symmetry)
+{
+    if (velocity == 0.0) return;
+    const int nx = g_buf.prev_nx;
+    const int ny = g_buf.prev_ny;
+    const int nz = g_buf.prev_nz;
+    const double dX = X[1] - X[0];
+    const double dY = Y[1] - Y[0];
+    const double dZ = Z[1] - Z[0];
+    const bool touch_xmax = fabs(X[nx - 1] - bbox[3]) < dX;
+    const bool touch_ymax = fabs(Y[ny - 1] - bbox[4]) < dY;
+    const bool touch_zmax = fabs(Z[nz - 1] - bbox[5]) < dZ;
+    const bool touch_xmin = fabs(X[0] - bbox[0]) < dX &&
+                            !(Symmetry == 2 && fabs(bbox[0]) < dX * 0.5);
+    const bool touch_ymin = fabs(Y[0] - bbox[1]) < dY &&
+                            !(Symmetry == 2 && fabs(bbox[1]) < dY * 0.5);
+    const bool touch_zmin = fabs(Z[0] - bbox[2]) < dZ &&
+                            !(Symmetry > 0 && fabs(bbox[2]) < dZ * 0.5);
+
+    const size_t w_pack = (size_t)(nx + 2) * (ny + 2) * (nz + 2);
+    kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f0, g_buf.d_fh2, SoA0, SoA1, SoA2);
+    if (touch_xmax) kern_sommerfeld_face_bam<<<grid((size_t)ny * nz), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_XMAX, velocity, X[0], Y[0], Z[0]);
+    if (touch_ymax) kern_sommerfeld_face_bam<<<grid((size_t)nx * nz), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_YMAX, velocity, X[0], Y[0], Z[0]);
+    if (touch_zmax) kern_sommerfeld_face_bam<<<grid((size_t)nx * ny), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_ZMAX, velocity, X[0], Y[0], Z[0]);
+    if (touch_xmin) kern_sommerfeld_face_bam<<<grid((size_t)ny * nz), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_XMIN, velocity, X[0], Y[0], Z[0]);
+    if (touch_ymin) kern_sommerfeld_face_bam<<<grid((size_t)nx * nz), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_YMIN, velocity, X[0], Y[0], Z[0]);
+    if (touch_zmin) kern_sommerfeld_face_bam<<<grid((size_t)nx * ny), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_ZMIN, velocity, X[0], Y[0], Z[0]);
+}

 /* ================================================================== */
 /*  C. Point-wise computation kernels                                  */
@@ -2075,14 +2314,435 @@ void kern_phase18_constraints(
                     +uxy*my_xz+uxz*mz_xz+uyz*mz_yz
                     - F2o3*Kz[i] - F8*PI_V*Sz_m[i];
    }
-}
+}
+
+static void setup_grid_params(int *ex,
+                              double *X, double *Y, double *Z,
+                              int Symmetry, double eps, int co)
+{
+    const int nx = ex[0];
+    const int ny = ex[1];
+    const int nz = ex[2];
+    const double dX = X[1] - X[0];
+    const double dY = Y[1] - Y[0];
+    const double dZ = Z[1] - Z[0];
+    const int NO_SYMM = 0;
+    const int EQ_SYMM = 1;
+
+    ensure_gpu_buffers(nx, ny, nz);
+
+    GridParams gp = {};
+    gp.ex[0] = nx;
+    gp.ex[1] = ny;
+    gp.ex[2] = nz;
+    gp.all = nx * ny * nz;
+    gp.dX = dX;
+    gp.dY = dY;
+    gp.dZ = dZ;
+    gp.d12dx = 1.0 / (12.0 * dX);
+    gp.d12dy = 1.0 / (12.0 * dY);
+    gp.d12dz = 1.0 / (12.0 * dZ);
+    gp.d2dx = 1.0 / (2.0 * dX);
+    gp.d2dy = 1.0 / (2.0 * dY);
+    gp.d2dz = 1.0 / (2.0 * dZ);
+    gp.Fdxdx = 1.0 / (12.0 * dX * dX);
+    gp.Fdydy = 1.0 / (12.0 * dY * dY);
+    gp.Fdzdz = 1.0 / (12.0 * dZ * dZ);
+    gp.Sdxdx = 1.0 / (dX * dX);
+    gp.Sdydy = 1.0 / (dY * dY);
+    gp.Sdzdz = 1.0 / (dZ * dZ);
+    gp.Fdxdy = 1.0 / (144.0 * dX * dY);
+    gp.Fdxdz = 1.0 / (144.0 * dX * dZ);
+    gp.Fdydz = 1.0 / (144.0 * dY * dZ);
+    gp.Sdxdy = 0.25 / (dX * dY);
+    gp.Sdxdz = 0.25 / (dX * dZ);
+    gp.Sdydz = 0.25 / (dY * dZ);
+    gp.iminF = 1;
+    gp.jminF = 1;
+    gp.kminF = 1;
+    gp.imaxF = nx;
+    gp.jmaxF = ny;
+    gp.kmaxF = nz;
+    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = -1;
+    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = -1;
+    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = -1;
+    gp.iminF3 = 1;
+    gp.jminF3 = 1;
+    gp.kminF3 = 1;
+    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = -2;
+    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = -2;
+    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = -2;
+    gp.Symmetry = Symmetry;
+    gp.eps = eps;
+    gp.co = co;
+    gp.fh2_nx = nx + 2;
+    gp.fh2_ny = ny + 2;
+    gp.fh2_nz = nz + 2;
+    gp.fh3_nx = nx + 3;
+    gp.fh3_ny = ny + 3;
+    gp.fh3_nz = nz + 3;
+    CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
+}
+
+static void upload_state_and_matter(double **state_host,
+                                    double **matter_host,
+                                    size_t all)
+{
+    static_assert(BSSN_STATE_COUNT + BSSN_MATTER_COUNT == H2D_INPUT_SLOT_COUNT,
+                  "state + matter upload must match contiguous input slots");
+    const size_t bytes = all * sizeof(double);
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
+    }
+    for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
+        std::memcpy(g_buf.h_stage + (size_t)(BSSN_STATE_COUNT + i) * all,
+                    matter_host[i], bytes);
+    }
+    CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
+                          (size_t)H2D_INPUT_SLOT_COUNT * bytes,
+                          cudaMemcpyHostToDevice));
+}
+
+static void launch_rhs_pipeline(int all, double eps, int co)
+{
+    const double SYM = 1.0;
+    const double ANTI = -1.0;
+
+    #define D(s) g_buf.slot[s]
+
+    kern_phase1_prep<<<grid(all),BLK>>>(
+        D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz),
+        D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz));
+
+    gpu_fderivs(D(S_betax), D(S_betaxx),D(S_betaxy),D(S_betaxz), ANTI,SYM,SYM, all);
+    gpu_fderivs(D(S_betay), D(S_betayx),D(S_betayy),D(S_betayz), SYM,ANTI,SYM, all);
+    gpu_fderivs(D(S_betaz), D(S_betazx),D(S_betazy),D(S_betazz), SYM,SYM,ANTI, all);
+    gpu_fderivs(D(S_chi),   D(S_chix),D(S_chiy),D(S_chiz),       SYM,SYM,SYM, all);
+    gpu_fderivs(D(S_dxx),   D(S_gxxx),D(S_gxxy),D(S_gxxz),       SYM,SYM,SYM, all);
+    gpu_fderivs(D(S_gxy),   D(S_gxyx),D(S_gxyy),D(S_gxyz),       ANTI,ANTI,SYM, all);
+    gpu_fderivs(D(S_gxz),   D(S_gxzx),D(S_gxzy),D(S_gxzz),       ANTI,SYM,ANTI, all);
+    gpu_fderivs(D(S_dyy),   D(S_gyyx),D(S_gyyy),D(S_gyyz),       SYM,SYM,SYM, all);
+    gpu_fderivs(D(S_gyz),   D(S_gyzx),D(S_gyzy),D(S_gyzz),       SYM,ANTI,ANTI, all);
+    gpu_fderivs(D(S_dzz),   D(S_gzzx),D(S_gzzy),D(S_gzzz),       SYM,SYM,SYM, all);
+    gpu_fderivs(D(S_Lap),   D(S_Lapx),D(S_Lapy),D(S_Lapz),       SYM,SYM,SYM, all);
+    gpu_fderivs(D(S_trK),   D(S_Kx),D(S_Ky),D(S_Kz),             SYM,SYM,SYM, all);
+
+    kern_phase2_metric_rhs<<<grid(all),BLK>>>(
+        D(S_alpn1), D(S_chin1),
+        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
+        D(S_trK),
+        D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
+        D(S_betaxx), D(S_betaxy), D(S_betaxz),
+        D(S_betayx), D(S_betayy), D(S_betayz),
+        D(S_betazx), D(S_betazy), D(S_betazz),
+        D(S_chi_rhs), D(S_gxx_rhs), D(S_gyy_rhs), D(S_gzz_rhs),
+        D(S_gxy_rhs), D(S_gyz_rhs), D(S_gxz_rhs));
+
+    kern_phase2_inverse<<<grid(all),BLK>>>(
+        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
+        D(S_gupxx), D(S_gupxy), D(S_gupxz),
+        D(S_gupyy), D(S_gupyz), D(S_gupzz));
+
+    if (co == 0) {
+        kern_phase3_gamma_constraint<<<grid(all),BLK>>>(
+            D(S_Gamx), D(S_Gamy), D(S_Gamz),
+            D(S_gupxx), D(S_gupxy), D(S_gupxz),
+            D(S_gupyy), D(S_gupyz), D(S_gupzz),
+            D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
+            D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
+            D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
+            D(S_Gmx_Res), D(S_Gmy_Res), D(S_Gmz_Res));
+    }
+
+    kern_phase4_christoffel<<<grid(all),BLK>>>(
+        D(S_gupxx), D(S_gupxy), D(S_gupxz),
+        D(S_gupyy), D(S_gupyz), D(S_gupzz),
+        D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
+        D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
+        D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
+        D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
+        D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
+        D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
+        D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
+        D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
+        D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz));
+
+    kern_phase5_raise_A<<<grid(all),BLK>>>(
+        D(S_gupxx), D(S_gupxy), D(S_gupxz),
+        D(S_gupyy), D(S_gupyz), D(S_gupzz),
+        D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
+        D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz));
+
+    kern_phase6_gamma_rhs_part1<<<grid(all),BLK>>>(
+        D(S_Lapx), D(S_Lapy), D(S_Lapz),
+        D(S_alpn1), D(S_chin1),
+        D(S_chix), D(S_chiy), D(S_chiz),
+        D(S_gupxx), D(S_gupxy), D(S_gupxz),
+        D(S_gupyy), D(S_gupyz), D(S_gupzz),
+        D(S_Kx), D(S_Ky), D(S_Kz),
+        D(S_Sx), D(S_Sy), D(S_Sz),
+        D(S_Rxx), D(S_Rxy), D(S_Rxz),
+        D(S_Ryy), D(S_Ryz), D(S_Rzz),
+        D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
+        D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
+        D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
+        D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
+        D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
+        D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
+        D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs));
+
+    gpu_fdderivs(D(S_betax), D(S_gxxx),D(S_gxyx),D(S_gxzx),
+                 D(S_gyyx),D(S_gyzx),D(S_gzzx), ANTI,SYM,SYM, all);
+    gpu_fdderivs(D(S_betay), D(S_gxxy),D(S_gxyy),D(S_gxzy),
+                 D(S_gyyy),D(S_gyzy),D(S_gzzy), SYM,ANTI,SYM, all);
+    gpu_fdderivs(D(S_betaz), D(S_gxxz),D(S_gxyz),D(S_gxzz),
+                 D(S_gyyz),D(S_gyzz),D(S_gzzz), SYM,SYM,ANTI, all);
+    gpu_fderivs(D(S_Gamx), D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), ANTI,SYM,SYM, all);
+    gpu_fderivs(D(S_Gamy), D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), SYM,ANTI,SYM, all);
+    gpu_fderivs(D(S_Gamz), D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), SYM,SYM,ANTI, all);
+
+    kern_phase8_gamma_rhs_part2<<<grid(all),BLK>>>(
+        D(S_gupxx), D(S_gupxy), D(S_gupxz),
+        D(S_gupyy), D(S_gupyz), D(S_gupzz),
+        D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
+        D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
+        D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
+        D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
+        D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
+        D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
+        D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
+        D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
+        D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
+        D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
+        D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
+        D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
+        D(S_betaxx),D(S_betaxy),D(S_betaxz),
+        D(S_betayx),D(S_betayy),D(S_betayz),
+        D(S_betazx),D(S_betazy),D(S_betazz),
+        D(S_Gamx_rhs),D(S_Gamy_rhs),D(S_Gamz_rhs),
+        D(S_Gamxa),D(S_Gamya),D(S_Gamza));
+
+    kern_phase9_christoffel_contract<<<grid(all),BLK>>>(
+        D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
+        D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
+        D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
+        D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
+        D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
+        D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
+        D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
+        D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
+        D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
+        D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
+
+    gpu_fdderivs(D(S_dxx), D(S_fxx),D(S_fxy),D(S_fxz),
+                 D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
+    kern_phase10_ricci_contract<<<grid(all),BLK>>>(
+        D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
+        D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxx));
+
+    gpu_fdderivs(D(S_dyy), D(S_fxx),D(S_fxy),D(S_fxz),
+                 D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
+    kern_phase10_ricci_contract<<<grid(all),BLK>>>(
+        D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
+        D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryy));
+
+    gpu_fdderivs(D(S_dzz), D(S_fxx),D(S_fxy),D(S_fxz),
+                 D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
+    kern_phase10_ricci_contract<<<grid(all),BLK>>>(
+        D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
+        D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rzz));
+
+    gpu_fdderivs(D(S_gxy), D(S_fxx),D(S_fxy),D(S_fxz),
+                 D(S_fyy),D(S_fyz),D(S_fzz), ANTI,ANTI,SYM, all);
+    kern_phase10_ricci_contract<<<grid(all),BLK>>>(
+        D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
+        D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxy));
+
+    gpu_fdderivs(D(S_gxz), D(S_fxx),D(S_fxy),D(S_fxz),
+                 D(S_fyy),D(S_fyz),D(S_fzz), ANTI,SYM,ANTI, all);
+    kern_phase10_ricci_contract<<<grid(all),BLK>>>(
+        D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
+        D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxz));
+
+    gpu_fdderivs(D(S_gyz), D(S_fxx),D(S_fxy),D(S_fxz),
+                 D(S_fyy),D(S_fyz),D(S_fzz), SYM,ANTI,ANTI, all);
+    kern_phase10_ricci_contract<<<grid(all),BLK>>>(
+        D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
+        D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryz));
+
+    kern_phase11_ricci_diag<<<grid(all),BLK>>>(
+        D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
+        D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
+        D(S_Gamxa),D(S_Gamya),D(S_Gamza),
+        D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
+        D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
+        D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
+        D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
+        D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
+        D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
+        D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
+        D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
+        D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
+        D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
+        D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
+        D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
+        D(S_Rxx),D(S_Ryy),D(S_Rzz));
+
+    kern_phase11_ricci_offdiag<<<grid(all),BLK>>>(
+        D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
+        D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
+        D(S_Gamxa),D(S_Gamya),D(S_Gamza),
+        D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
+        D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
+        D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
+        D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
+        D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
+        D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
+        D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
+        D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
+        D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
+        D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
+        D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
+        D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
+        D(S_Rxy),D(S_Rxz),D(S_Ryz));
+
+    gpu_fdderivs(D(S_chi), D(S_fxx),D(S_fxy),D(S_fxz),
+                 D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
+
+    kern_phase13_chi_correction<<<grid(all),BLK>>>(
+        D(S_chin1),
+        D(S_chix), D(S_chiy), D(S_chiz),
+        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
+        D(S_gupxx), D(S_gupxy), D(S_gupxz),
+        D(S_gupyy), D(S_gupyz), D(S_gupzz),
+        D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
+        D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
+        D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
+        D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
+        D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
+        D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
+        D(S_fxx), D(S_fxy), D(S_fxz),
+        D(S_fyy), D(S_fyz), D(S_fzz),
+        D(S_Rxx), D(S_Rxy), D(S_Rxz),
+        D(S_Ryy), D(S_Ryz), D(S_Rzz));
+
+    gpu_fdderivs(D(S_Lap), D(S_fxx),D(S_fxy),D(S_fxz),
+                 D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
+    gpu_fderivs(D(S_chi), D(S_dtSfx_rhs),D(S_dtSfy_rhs),D(S_dtSfz_rhs),
+                SYM,SYM,SYM, all);
+
+    kern_phase15_trK_Aij_gauge<<<grid(all),BLK>>>(
+        D(S_alpn1), D(S_chin1),
+        D(S_chix), D(S_chiy), D(S_chiz),
+        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
+        D(S_gupxx), D(S_gupxy), D(S_gupxz),
+        D(S_gupyy), D(S_gupyz), D(S_gupzz),
+        D(S_trK),
+        D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
+        D(S_Lapx), D(S_Lapy), D(S_Lapz),
+        D(S_betaxx), D(S_betaxy), D(S_betaxz),
+        D(S_betayx), D(S_betayy), D(S_betayz),
+        D(S_betazx), D(S_betazy), D(S_betazz),
+        D(S_rho),
+        D(S_Sx), D(S_Sy), D(S_Sz),
+        D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz),
+        D(S_dtSfx), D(S_dtSfy), D(S_dtSfz),
+        D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
+        D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
+        D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
+        D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
+        D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
+        D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
+        D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
+        D(S_fxx), D(S_fxy), D(S_fxz),
+        D(S_fyy), D(S_fyz), D(S_fzz),
+        D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs),
+        D(S_trK_rhs),
+        D(S_Axx_rhs), D(S_Axy_rhs), D(S_Axz_rhs),
+        D(S_Ayy_rhs), D(S_Ayz_rhs), D(S_Azz_rhs),
+        D(S_Lap_rhs),
+        D(S_betax_rhs), D(S_betay_rhs), D(S_betaz_rhs),
+        D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs),
+        D(S_f_arr), D(S_S_arr));
+
+    gpu_lopsided_kodis(D(S_gxx),   D(S_dxx),   D(S_gxx_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_Gamz),  D(S_Gamz),  D(S_Gamz_rhs),  D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI,   eps, all);
+    gpu_lopsided_kodis(D(S_gxy),   D(S_gxy),   D(S_gxy_rhs),   D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM,   eps, all);
+    gpu_lopsided_kodis(D(S_Lap),   D(S_Lap),   D(S_Lap_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_gxz),   D(S_gxz),   D(S_gxz_rhs),   D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI,   eps, all);
+    gpu_lopsided_kodis(D(S_betax), D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_gyy),   D(S_dyy),   D(S_gyy_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_betay), D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_gyz),   D(S_gyz),   D(S_gyz_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI,   eps, all);
+    gpu_lopsided_kodis(D(S_betaz), D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI,    eps, all);
+    gpu_lopsided_kodis(D(S_gzz),   D(S_dzz),   D(S_gzz_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_dtSfx), D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_Axx),   D(S_Axx),   D(S_Axx_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_dtSfy), D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_Axy),   D(S_Axy),   D(S_Axy_rhs),   D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM,   eps, all);
+    gpu_lopsided_kodis(D(S_dtSfz), D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI,    eps, all);
+    gpu_lopsided_kodis(D(S_Axz),   D(S_Axz),   D(S_Axz_rhs),   D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI,   eps, all);
+    gpu_lopsided_kodis(D(S_Ayy),   D(S_Ayy),   D(S_Ayy_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_Ayz),   D(S_Ayz),   D(S_Ayz_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI,   eps, all);
+    gpu_lopsided_kodis(D(S_Azz),   D(S_Azz),   D(S_Azz_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_chi),   D(S_chi),   D(S_chi_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_trK),   D(S_trK),   D(S_trK_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_Gamx),  D(S_Gamx),  D(S_Gamx_rhs),  D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM,    eps, all);
+    gpu_lopsided_kodis(D(S_Gamy),  D(S_Gamy),  D(S_Gamy_rhs),  D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM,    eps, all);
+
+    if (co == 0) {
+        gpu_fderivs(D(S_Axx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM,      all);
+        gpu_fderivs(D(S_Axy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM,     all);
+        gpu_fderivs(D(S_Axz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI,     all);
+        gpu_fderivs(D(S_Ayy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM,       all);
+        gpu_fderivs(D(S_Ayz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI,     all);
+        gpu_fderivs(D(S_Azz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM,       all);
+
+        kern_phase18_constraints<<<grid(all),BLK>>>(
+            D(S_chin1),
+            D(S_chix), D(S_chiy), D(S_chiz),
+            D(S_gupxx), D(S_gupxy), D(S_gupxz),
+            D(S_gupyy), D(S_gupyz), D(S_gupzz),
+            D(S_trK),
+            D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
+            D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
+            D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz),
+            D(S_Kx), D(S_Ky), D(S_Kz),
+            D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
+            D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
+            D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
+            D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
+            D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
+            D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
+            D(S_gxxx), D(S_gxxy), D(S_gxxz),
+            D(S_gxyx), D(S_gxyy), D(S_gxyz),
+            D(S_gxzx), D(S_gxzy), D(S_gxzz),
+            D(S_gyyx), D(S_gyyy), D(S_gyyz),
+            D(S_gyzx), D(S_gyzy), D(S_gyzz),
+            D(S_gzzx), D(S_gzzy), D(S_gzzz),
+            D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
+    }
+
+    #undef D
+}
+
+static void download_state_outputs(double **state_host_out, size_t all)
+{
+    const size_t bytes = all * sizeof(double);
+    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs],
+                          (size_t)BSSN_STATE_COUNT * bytes,
+                          cudaMemcpyDeviceToHost));
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
+    }
+}
+
+/* ================================================================== */
+/*  Main host function — drop-in replacement for bssn_rhs_c.C          */
+/* ================================================================== */

-/* ================================================================== */
-/*  Main host function — drop-in replacement for bssn_rhs_c.C          */
-/* ================================================================== */
-
-extern "C"
-int f_compute_rhs_bssn(int *ex, double &T,
+extern "C"
+int f_compute_rhs_bssn(int *ex, double &T,
    double *X, double *Y, double *Z,
    double *chi, double *trK,
    double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
@@ -2563,3 +3223,76 @@ int f_compute_rhs_bssn(int *ex, double &T,
    #undef D
    return 0;
 }
+
+extern "C"
+int bssn_cuda_rk4_substep(void *block_tag,
+                          int *ex, double *X, double *Y, double *Z,
+                          double **state_host_in,
+                          double **state_host_out,
+                          double **matter_host,
+                          const double *propspeed,
+                          const double *soa_flat,
+                          const double *bbox,
+                          double &dT,
+                          double &T,
+                          int &RK4,
+                          int &apply_bam_bc,
+                          int &Symmetry,
+                          int &Lev,
+                          double &eps,
+                          int &co)
+{
+    (void)T;
+    (void)Lev;
+
+    if (RK4 < 0 || RK4 > 3) return 1;
+
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const size_t bytes = all * sizeof(double);
+
+    setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
+    upload_state_and_matter(state_host_in, matter_host, all);
+
+    StepContext &ctx = ensure_step_ctx(block_tag, all);
+    if (RK4 == 0) {
+        CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
+                              (size_t)BSSN_STATE_COUNT * bytes,
+                              cudaMemcpyDeviceToDevice));
+    }
+
+    launch_rhs_pipeline((int)all, eps, co);
+
+    if (apply_bam_bc) {
+        for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+            gpu_sommerfeld_routbam(g_buf.slot[k_state_input_slots[i]],
+                                   g_buf.slot[k_state_rhs_slots[i]],
+                                   propspeed[i],
+                                   soa_flat[3 * i + 0],
+                                   soa_flat[3 * i + 1],
+                                   soa_flat[3 * i + 2],
+                                   X, Y, Z, bbox, Symmetry);
+        }
+    }
+
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_state0[i],
+                                              g_buf.slot[k_state_rhs_slots[i]],
+                                              ctx.d_accum[i],
+                                              dT,
+                                              RK4);
+    }
+
+    download_state_outputs(state_host_out, all);
+    return 0;
+}
+
+extern "C"
+void bssn_cuda_release_step_ctx(void *block_tag)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    release_step_ctx(block_tag);
+}
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -1,13 +1,18 @@
 #ifndef BSSN_RHS_CUDA_H
 #define BSSN_RHS_CUDA_H

-#ifdef __cplusplus
-extern "C" {
-#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+    BSSN_CUDA_STATE_COUNT = 24,
+    BSSN_CUDA_MATTER_COUNT = 10
+};

-int f_compute_rhs_bssn(int *ex, double &T,
-                       double *X, double *Y, double *Z,
-                       double *chi, double *trK,
+int f_compute_rhs_bssn(int *ex, double &T,
+                       double *X, double *Y, double *Z,
+                       double *chi, double *trK,
                       double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                       double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                       double *Gamx, double *Gamy, double *Gamz,
@@ -26,11 +31,30 @@ int f_compute_rhs_bssn(int *ex, double &T,
                       double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                       double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                       double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
-                       double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
-                       int &Symmetry, int &Lev, double &eps, int &co);
-
-#ifdef __cplusplus
-}
+                       double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
+                       int &Symmetry, int &Lev, double &eps, int &co);
+
+int bssn_cuda_rk4_substep(void *block_tag,
+                          int *ex, double *X, double *Y, double *Z,
+                          double **state_host_in,
+                          double **state_host_out,
+                          double **matter_host,
+                          const double *propspeed,
+                          const double *soa_flat,
+                          const double *bbox,
+                          double &dT,
+                          double &T,
+                          int &RK4,
+                          int &apply_bam_bc,
+                          int &Symmetry,
+                          int &Lev,
+                          double &eps,
+                          int &co);
+
+void bssn_cuda_release_step_ctx(void *block_tag);
+
+#ifdef __cplusplus
+}
 #endif

 #endif
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -42,16 +42,16 @@ endif
 .for.o:
 	$(f77) -c $< -o $@

-.cu.o:
-	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
-
-# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
-bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h
-	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
-
-# C rewrite of BSSN RHS kernel and helpers
-bssn_rhs_c.o: bssn_rhs_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+.cu.o:
+	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
+
+# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
+bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h
+	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
+
+# C rewrite of BSSN RHS kernel and helpers
+bssn_rhs_c.o: bssn_rhs_c.C
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 fderivs_c.o: fderivs_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
@@ -83,46 +83,49 @@ TwoPunctures.o: TwoPunctures.C
 TwoPunctureABE.o: TwoPunctureABE.C
 	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@

-# Input files
-
-## CUDA BSSN RHS switch
-##   1 : use the rewritten CUDA bssn_rhs backend
-##   0 : keep the normal CPU/Fortran selection below
-USE_CUDA_BSSN ?= 0
-
-## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
-ifeq ($(USE_CXX_KERNELS),0)
-# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
-CFILES_CPU =
-else
-# C++ mode (default): C rewrite of bssn_rhs and helper kernels
-CFILES_CPU = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
-endif
-
-CFILES_CUDA_BSSN = bssn_rhs_cuda.o
-
-ifeq ($(USE_CUDA_BSSN),1)
-CFILES = $(CFILES_CUDA_BSSN)
-else
-CFILES = $(CFILES_CPU)
-endif
-
-## RK4 kernel switch (independent from USE_CXX_KERNELS)
-ifeq ($(USE_CXX_RK4),1)
-RK4_C_OBJ = rungekutta4_rout_c.o
-RK4_F90_OBJ =
-else
-RK4_C_OBJ =
-RK4_F90_OBJ = rungekutta4_rout.o
-endif
-
-CFILES += $(RK4_C_OBJ)
-ABE_CUDA_CFILES = $(CFILES_CUDA_BSSN) $(RK4_C_OBJ)
-
-ABE_LDLIBS = $(LDLIBS)
-ifeq ($(USE_CUDA_BSSN),1)
-ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH)
-endif
+# Input files
+
+## CUDA BSSN RHS switch
+##   1 : use the rewritten CUDA bssn_rhs backend
+##   0 : keep the normal CPU/Fortran selection below
+USE_CUDA_BSSN ?= 1
+
+CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
+CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
+
+## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
+ifeq ($(USE_CXX_KERNELS),0)
+# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
+CFILES_CPU =
+else
+# C++ mode (default): C rewrite of bssn_rhs and helper kernels
+CFILES_CPU = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
+endif
+
+CFILES_CUDA_BSSN = bssn_rhs_cuda.o
+
+ifeq ($(USE_CUDA_BSSN),1)
+CFILES = $(CFILES_CUDA_BSSN)
+else
+CFILES = $(CFILES_CPU)
+endif
+
+## RK4 kernel switch (independent from USE_CXX_KERNELS)
+ifeq ($(USE_CXX_RK4),1)
+RK4_C_OBJ = rungekutta4_rout_c.o
+RK4_F90_OBJ =
+else
+RK4_C_OBJ =
+RK4_F90_OBJ = rungekutta4_rout.o
+endif
+
+CFILES += $(RK4_C_OBJ)
+ABE_CUDA_CFILES = $(CFILES_CUDA_BSSN) $(RK4_C_OBJ)
+
+ABE_LDLIBS = $(LDLIBS)
+ifeq ($(USE_CUDA_BSSN),1)
+ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH)
+endif

 C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
@@ -171,8 +174,8 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o

 #CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o

-# file dependences
-$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
+# file dependences
+$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh

 $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
 	     misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
@@ -195,7 +198,7 @@ $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
             
 $(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h

-$(C++FILES) $(C++FILES_GPU) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
+$(C++FILES) $(C++FILES_GPU) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h

 TwoPunctureFILES: TwoPunctures.h

@@ -203,18 +206,18 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h

 misc.o : zbesh.o

-# projects
-ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(ABE_LDLIBS)
-
-ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
-	
-#ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
-#	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
+# projects
+ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
+	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(ABE_LDLIBS)
+
+ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
+	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
+	
+#ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
+#	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)

 TwoPunctureABE: $(TwoPunctureFILES)
 	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)

-clean:
-	rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f
+clean:
+	rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f