GPU-accelerate Shell-Patch BSSN evolution

Phase 1: Enable GPU resident state for Cartesian patches in Shell mode.
- Remove WithShell guard from bssn_cuda_use_resident_sync().
- Add GPU-to-CPU state sync before shell CPU consumers (SHStep, CS_Inter, inline shell RHS blocks).

Phase 2: GPU-accelerate BSSN Shell Patch RHS.
- Create bssn_gpu.h with RHS_SS_PARA macro and gpu_rhs_ss declaration.
- Fix compilation bugs in legacy bssn_gpu_rhs_ss.cu (deprecated cudaThreadSynchronize, tmp_con2 redeclaration, ijkmin3_h typo, CUDA_SAFE_CALL, missing compare_result guard).
- Add bssn_gpu_rhs_ss.o to CFILES_CUDA_BSSN with build rule.
- Write cuda_compute_rhs_bssn_ss() wrapper bridging Fortran and GPU parameter conventions, and redirect all shell RHS call sites via #define.

Verified: 30-step Shell-Patch GPU run completes without errors/NaN. Step wall time ~4.4s (step_fn ~2.0s + RP ~0.68s + constraint ~0.70s).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-09 18:50:10 +08:00
parent 5eb49949d9
commit bd4ce3fbf3
4 changed files with 234 additions and 46 deletions
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -27,7 +27,81 @@ using namespace std;
 #include "bssn_rhs.h"
 #if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
+#ifdef WithShell
+#include "bssn_gpu.h"
 #endif
+#endif
+
+#if USE_CUDA_BSSN && defined(WithShell)
+// GPU-accelerated shell RHS wrapper. It takes the same parameter list as f_compute_rhs_bssn_ss so the macro below can substitute it at existing call sites.
+// Forwards every argument unchanged, in order, to gpu_rhs_ss after prepending calledby=0 and mpi_rank=0. NOTE(review): mpi_rank is hard-coded to 0 (read as "device 0" by the original author), so presumably every MPI rank targets the same GPU - confirm for multi-GPU runs.
+extern "C" {
+static int cuda_compute_rhs_bssn_ss(
+    int *ex, double &T, double *crho, double *sigma, double *R,
+    double *X, double *Y, double *Z,
+    double *drhodx, double *drhody, double *drhodz,
+    double *dsigmadx, double *dsigmady, double *dsigmadz,
+    double *dRdx, double *dRdy, double *dRdz,
+    double *drhodxx, double *drhodxy, double *drhodxz, double *drhodyy, double *drhodyz, double *drhodzz,
+    double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, double *dsigmadyy, double *dsigmadyz, double *dsigmadzz,
+    double *dRdxx, double *dRdxy, double *dRdxz, double *dRdyy, double *dRdyz, double *dRdzz,
+    double *chi, double *trK,
+    double *gxx, double *gxy, double *gxz, double *gyy, double *gyz, double *gzz,
+    double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
+    double *Gamx, double *Gamy, double *Gamz,
+    double *Lap, double *betax, double *betay, double *betaz,
+    double *dtSfx, double *dtSfy, double *dtSfz,
+    double *chi_rhs, double *trK_rhs,
+    double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
+    double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
+    double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
+    double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
+    double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
+    double *rho, double *Sx, double *Sy, double *Sz,
+    double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
+    double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
+    double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
+    double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
+    double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
+    double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
+    double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
+    int &Symmetry, int &Lev, double &eps, int &sst, int &co)
+{
+    return gpu_rhs_ss(0, 0,  // calledby=ABE_main, mpi_rank=device_0
+                     ex, T, crho, sigma, R, X, Y, Z,
+                     drhodx, drhody, drhodz,
+                     dsigmadx, dsigmady, dsigmadz,
+                     dRdx, dRdy, dRdz,
+                     drhodxx, drhodxy, drhodxz, drhodyy, drhodyz, drhodzz,
+                     dsigmadxx, dsigmadxy, dsigmadxz, dsigmadyy, dsigmadyz, dsigmadzz,
+                     dRdxx, dRdxy, dRdxz, dRdyy, dRdyz, dRdzz,
+                     chi, trK,
+                     gxx, gxy, gxz, gyy, gyz, gzz,
+                     Axx, Axy, Axz, Ayy, Ayz, Azz,
+                     Gamx, Gamy, Gamz,
+                     Lap, betax, betay, betaz,
+                     dtSfx, dtSfy, dtSfz,
+                     chi_rhs, trK_rhs,
+                     gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
+                     Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
+                     Gamx_rhs, Gamy_rhs, Gamz_rhs,
+                     Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
+                     dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
+                     rho, Sx, Sy, Sz,
+                     Sxx, Sxy, Sxz, Syy, Syz, Szz,
+                     Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
+                     Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
+                     Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
+                     Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
+                     ham_Res, movx_Res, movy_Res, movz_Res,
+                     Gmx_Res, Gmy_Res, Gmz_Res,
+                     Symmetry, Lev, eps, sst, co);
+}
+}
+// From here on the preprocessor rewrites the name f_compute_rhs_bssn_ss to the GPU wrapper, both at the call sites below and in any header included after this point that mentions it.
+#define f_compute_rhs_bssn_ss cuda_compute_rhs_bssn_ss
+#endif
+
 #include "initial_puncture.h"
 #include "enforce_algebra.h"
 #include "rungekutta4_rout.h"
@@ -474,12 +548,8 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,

 bool bssn_cuda_use_resident_sync(int lev)
 {
-#ifdef WithShell
  (void)lev;
-  return false;
-#else
  return true;
-#endif
 }

 bool bssn_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
@@ -3464,6 +3534,13 @@ void bssn_class::RecursiveStep(int lev)
    // RestrictProlong(lev,YN,false,StateList,OldStateList,SynchList_cor);

 #ifdef WithShell
+#if USE_CUDA_BSSN
+    if (bssn_cuda_use_resident_sync(lev))
+    {
+      for (int dl = 0; dl < GH->levels; dl++)
+        bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
+    }
+#endif
    if (lev == 0)
    {
      clock_t prev_clock, curr_clock;
@@ -3622,6 +3699,16 @@ void bssn_class::ParallelStep()
 #endif

 #ifdef WithShell
+#if USE_CUDA_BSSN
+    {
+      const int lev0 = 0;
+      if (bssn_cuda_use_resident_sync(lev0))
+      {
+        for (int dl = 0; dl < GH->levels; dl++)
+          bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
+      }
+    }
+#endif
    SHStep();
 #if (RPS == 1)
    {
@@ -3976,6 +4063,13 @@ void bssn_class::ParallelStep()
 }

 #ifdef WithShell
+#if USE_CUDA_BSSN
+  if (bssn_cuda_use_resident_sync(lev))
+  {
+    for (int dl = 0; dl < GH->levels; dl++)
+      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
+  }
+#endif
  SHStep();
  //               a_stream.clear();
  //               a_stream.str("");
@@ -4427,6 +4521,13 @@ void bssn_class::Step(int lev, int YN)
  // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls

 #ifdef WithShell
+#if USE_CUDA_BSSN
+  if (bssn_cuda_use_resident_sync(lev))
+  {
+    for (int dl = 0; dl < GH->levels; dl++)
+      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
+  }
+#endif
  // evolve Shell Patches
  if (lev == 0)
  {
@@ -4878,6 +4979,13 @@ void bssn_class::Step(int lev, int YN)
    // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls

 #ifdef WithShell
+#if USE_CUDA_BSSN
+    if (bssn_cuda_use_resident_sync(lev))
+    {
+      for (int dl = 0; dl < GH->levels; dl++)
+        bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
+    }
+#endif
    // evolve Shell Patches
    if (lev == 0)
    {
@@ -5398,6 +5506,13 @@ void bssn_class::Step(int lev, int YN)
  // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls

 #ifdef WithShell
+#if USE_CUDA_BSSN
+  if (bssn_cuda_use_resident_sync(lev))
+  {
+    for (int dl = 0; dl < GH->levels; dl++)
+      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
+  }
+#endif
  // evolve Shell Patches
  if (lev == 0)
  {
@@ -5750,6 +5865,13 @@ void bssn_class::Step(int lev, int YN)
    // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls

 #ifdef WithShell
+#if USE_CUDA_BSSN
+    if (bssn_cuda_use_resident_sync(lev))
+    {
+      for (int dl = 0; dl < GH->levels; dl++)
+        bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
+    }
+#endif
    // evolve Shell Patches
    if (lev == 0)
    {
@@ -6673,6 +6795,14 @@ void bssn_class::SHStep()
  //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"start Step");
  // #endif

+#if USE_CUDA_BSSN
+  if (bssn_cuda_use_resident_sync(lev))
+  {
+    for (int dl = 0; dl < GH->levels; dl++)
+      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
+  }
+#endif
+
  setpbh(BH_num, Porg0, Mass, BH_num_input);

  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));