GPU-accelerate Shell-Patch BSSN evolution
Phase 1: Enable GPU-resident state for Cartesian patches in Shell mode.
- Remove the WithShell guard from bssn_cuda_use_resident_sync().
- Add a GPU-to-CPU state sync before the shell CPU consumers (SHStep, CS_Inter, inline shell RHS blocks).

Phase 2: GPU-accelerate the BSSN Shell-Patch RHS.
- Create bssn_gpu.h with the RHS_SS_PARA macro and the gpu_rhs_ss declaration.
- Fix compilation bugs in the legacy bssn_gpu_rhs_ss.cu (deprecated cudaThreadSynchronize, tmp_con2 redeclaration, ijkmin3_h typo, CUDA_SAFE_CALL, missing compare_result guard).
- Add bssn_gpu_rhs_ss.o to CFILES_CUDA_BSSN with a build rule.
- Write the cuda_compute_rhs_bssn_ss() wrapper bridging the Fortran and GPU parameter conventions, and redirect all shell RHS call sites to it via #define.

Verified: a 30-step Shell-Patch GPU run completes without errors or NaNs. Step wall time is ~4.4 s (step_fn ~2.0 s + RP ~0.68 s + constraint ~0.70 s).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -27,7 +27,81 @@ using namespace std;
|
||||
#include "bssn_rhs.h"
|
||||
#if USE_CUDA_BSSN
|
||||
#include "bssn_rhs_cuda.h"
|
||||
#ifdef WithShell
|
||||
#include "bssn_gpu.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if USE_CUDA_BSSN && defined(WithShell)
|
||||
// GPU-accelerated shell RHS: same parameter signature as f_compute_rhs_bssn_ss.
//
// Purpose: thin adapter that lets existing shell-patch RHS call sites reach
// gpu_rhs_ss without being rewritten. It forwards every argument unchanged and
// in the same order, and prepends two literal zeros (calledby, mpi_rank).
//
// Parameters (grouping inferred from names only - TODO confirm against
// gpu_rhs_ss / bssn_gpu.h):
//   ex                      - int array, presumably the patch extents.
//   T, eps                  - double scalars passed by reference.
//   crho, sigma, R, X, Y, Z - coordinate arrays.
//   d{rho,sigma,R}d*        - first and second derivative arrays of the
//                             (rho, sigma, R) coordinates w.r.t. x, y, z.
//   chi ... dtSfz           - field arrays; the matching *_rhs arrays are
//                             presumably the outputs written by the GPU routine.
//   rho, S*, Gam***, R**    - further work/source arrays passed straight through.
//   ham_Res ... Gmz_Res     - residual arrays passed straight through.
//   Symmetry, Lev, sst, co  - int scalars passed by reference.
//
// Returns: whatever gpu_rhs_ss returns; no value is inspected or altered here.
//
// The function is `static`, so it is only usable from this translation unit.
|
||||
// Internally calls gpu_rhs_ss with calledby=0, mpi_rank=0 (device 0).
// NOTE(review): the rank argument is a hard-coded 0 rather than the caller's
// MPI rank. If gpu_rhs_ss picks the CUDA device from this value, every MPI
// rank would target device 0 - confirm this is intended for multi-GPU runs.
|
||||
extern "C" {
|
||||
static int cuda_compute_rhs_bssn_ss(
|
||||
int *ex, double &T, double *crho, double *sigma, double *R,
|
||||
double *X, double *Y, double *Z,
|
||||
double *drhodx, double *drhody, double *drhodz,
|
||||
double *dsigmadx, double *dsigmady, double *dsigmadz,
|
||||
double *dRdx, double *dRdy, double *dRdz,
|
||||
double *drhodxx, double *drhodxy, double *drhodxz, double *drhodyy, double *drhodyz, double *drhodzz,
|
||||
double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, double *dsigmadyy, double *dsigmadyz, double *dsigmadzz,
|
||||
double *dRdxx, double *dRdxy, double *dRdxz, double *dRdyy, double *dRdyz, double *dRdzz,
|
||||
double *chi, double *trK,
|
||||
double *gxx, double *gxy, double *gxz, double *gyy, double *gyz, double *gzz,
|
||||
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
|
||||
double *Gamx, double *Gamy, double *Gamz,
|
||||
double *Lap, double *betax, double *betay, double *betaz,
|
||||
double *dtSfx, double *dtSfy, double *dtSfz,
|
||||
double *chi_rhs, double *trK_rhs,
|
||||
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
|
||||
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
|
||||
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
|
||||
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
|
||||
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
|
||||
double *rho, double *Sx, double *Sy, double *Sz,
|
||||
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
|
||||
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
|
||||
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
|
||||
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
|
||||
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
|
||||
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
|
||||
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
|
||||
int &Symmetry, int &Lev, double &eps, int &sst, int &co)
|
||||
{
|
||||
// Pure pass-through: the argument order below must stay in lock-step with
// the parameter list above; nothing is reordered, copied, or converted here.
return gpu_rhs_ss(0, 0, // calledby = 0 (ABE_main), mpi_rank = 0 (device 0)
|
||||
ex, T, crho, sigma, R, X, Y, Z,
|
||||
drhodx, drhody, drhodz,
|
||||
dsigmadx, dsigmady, dsigmadz,
|
||||
dRdx, dRdy, dRdz,
|
||||
drhodxx, drhodxy, drhodxz, drhodyy, drhodyz, drhodzz,
|
||||
dsigmadxx, dsigmadxy, dsigmadxz, dsigmadyy, dsigmadyz, dsigmadzz,
|
||||
dRdxx, dRdxy, dRdxz, dRdyy, dRdyz, dRdzz,
|
||||
chi, trK,
|
||||
gxx, gxy, gxz, gyy, gyz, gzz,
|
||||
Axx, Axy, Axz, Ayy, Ayz, Azz,
|
||||
Gamx, Gamy, Gamz,
|
||||
Lap, betax, betay, betaz,
|
||||
dtSfx, dtSfy, dtSfz,
|
||||
chi_rhs, trK_rhs,
|
||||
gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
|
||||
Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
|
||||
Gamx_rhs, Gamy_rhs, Gamz_rhs,
|
||||
Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
|
||||
dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
|
||||
rho, Sx, Sy, Sz,
|
||||
Sxx, Sxy, Sxz, Syy, Syz, Szz,
|
||||
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
|
||||
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
|
||||
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
|
||||
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
|
||||
ham_Res, movx_Res, movy_Res, movz_Res,
|
||||
Gmx_Res, Gmy_Res, Gmz_Res,
|
||||
Symmetry, Lev, eps, sst, co);
|
||||
}
|
||||
}
|
||||
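// Textual redirect: every use of f_compute_rhs_bssn_ss that appears after this
// line in this file is rewritten by the preprocessor to call the GPU wrapper
// cuda_compute_rhs_bssn_ss instead. The #define must therefore stay below the
// wrapper definition and above the first shell RHS call site.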
|
||||
#define f_compute_rhs_bssn_ss cuda_compute_rhs_bssn_ss
|
||||
#endif
|
||||
|
||||
#include "initial_puncture.h"
|
||||
#include "enforce_algebra.h"
|
||||
#include "rungekutta4_rout.h"
|
||||
@@ -474,12 +548,8 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
|
||||
|
||||
// Reports whether the GPU-resident state path (with explicit device-to-host
// sync points) is used for refinement level `lev`.
//
// The result no longer depends on WithShell: shell builds also keep state
// resident on the GPU. Callers in this file guard their
// bssn_cuda_download_level_state_if_present(...) loops with this predicate
// inside `#ifdef WithShell` sections, so returning false there would turn
// every one of those GPU-to-CPU downloads into dead code.
//
// Parameters:
//   lev - refinement level; currently unused, kept for interface stability.
// Returns:
//   always true.
bool bssn_cuda_use_resident_sync(int lev)
{
  (void)lev; // the decision is currently level-independent
  return true;
}
|
||||
|
||||
bool bssn_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
|
||||
@@ -3464,6 +3534,13 @@ void bssn_class::RecursiveStep(int lev)
|
||||
// RestrictProlong(lev,YN,false,StateList,OldStateList,SynchList_cor);
|
||||
|
||||
#ifdef WithShell
|
||||
#if USE_CUDA_BSSN
|
||||
if (bssn_cuda_use_resident_sync(lev))
|
||||
{
|
||||
for (int dl = 0; dl < GH->levels; dl++)
|
||||
bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
|
||||
}
|
||||
#endif
|
||||
if (lev == 0)
|
||||
{
|
||||
clock_t prev_clock, curr_clock;
|
||||
@@ -3622,6 +3699,16 @@ void bssn_class::ParallelStep()
|
||||
#endif
|
||||
|
||||
#ifdef WithShell
|
||||
#if USE_CUDA_BSSN
|
||||
{
|
||||
const int lev0 = 0;
|
||||
if (bssn_cuda_use_resident_sync(lev0))
|
||||
{
|
||||
for (int dl = 0; dl < GH->levels; dl++)
|
||||
bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
SHStep();
|
||||
#if (RPS == 1)
|
||||
{
|
||||
@@ -3976,6 +4063,13 @@ void bssn_class::ParallelStep()
|
||||
}
|
||||
|
||||
#ifdef WithShell
|
||||
#if USE_CUDA_BSSN
|
||||
if (bssn_cuda_use_resident_sync(lev))
|
||||
{
|
||||
for (int dl = 0; dl < GH->levels; dl++)
|
||||
bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
|
||||
}
|
||||
#endif
|
||||
SHStep();
|
||||
// a_stream.clear();
|
||||
// a_stream.str("");
|
||||
@@ -4427,6 +4521,13 @@ void bssn_class::Step(int lev, int YN)
|
||||
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
|
||||
|
||||
#ifdef WithShell
|
||||
#if USE_CUDA_BSSN
|
||||
if (bssn_cuda_use_resident_sync(lev))
|
||||
{
|
||||
for (int dl = 0; dl < GH->levels; dl++)
|
||||
bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
|
||||
}
|
||||
#endif
|
||||
// evolve Shell Patches
|
||||
if (lev == 0)
|
||||
{
|
||||
@@ -4878,6 +4979,13 @@ void bssn_class::Step(int lev, int YN)
|
||||
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
|
||||
|
||||
#ifdef WithShell
|
||||
#if USE_CUDA_BSSN
|
||||
if (bssn_cuda_use_resident_sync(lev))
|
||||
{
|
||||
for (int dl = 0; dl < GH->levels; dl++)
|
||||
bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
|
||||
}
|
||||
#endif
|
||||
// evolve Shell Patches
|
||||
if (lev == 0)
|
||||
{
|
||||
@@ -5398,6 +5506,13 @@ void bssn_class::Step(int lev, int YN)
|
||||
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
|
||||
|
||||
#ifdef WithShell
|
||||
#if USE_CUDA_BSSN
|
||||
if (bssn_cuda_use_resident_sync(lev))
|
||||
{
|
||||
for (int dl = 0; dl < GH->levels; dl++)
|
||||
bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
|
||||
}
|
||||
#endif
|
||||
// evolve Shell Patches
|
||||
if (lev == 0)
|
||||
{
|
||||
@@ -5750,6 +5865,13 @@ void bssn_class::Step(int lev, int YN)
|
||||
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
|
||||
|
||||
#ifdef WithShell
|
||||
#if USE_CUDA_BSSN
|
||||
if (bssn_cuda_use_resident_sync(lev))
|
||||
{
|
||||
for (int dl = 0; dl < GH->levels; dl++)
|
||||
bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
|
||||
}
|
||||
#endif
|
||||
// evolve Shell Patches
|
||||
if (lev == 0)
|
||||
{
|
||||
@@ -6673,6 +6795,14 @@ void bssn_class::SHStep()
|
||||
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"start Step");
|
||||
// #endif
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
if (bssn_cuda_use_resident_sync(lev))
|
||||
{
|
||||
for (int dl = 0; dl < GH->levels; dl++)
|
||||
bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
|
||||
}
|
||||
#endif
|
||||
|
||||
setpbh(BH_num, Porg0, Mass, BH_num_input);
|
||||
|
||||
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
||||
|
||||
Reference in New Issue
Block a user