Refine GPU runtime controls and input checker

This commit is contained in:
2026-05-18 01:02:55 +08:00
parent f2264989d8
commit a99534d2f3
8 changed files with 514 additions and 67 deletions

View File

@@ -262,7 +262,10 @@ Z4c_class::~Z4c_class()
//================================================================================================
#define MRBD 0 // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them
#ifndef AMSS_Z4C_MRBD
#define AMSS_Z4C_MRBD 0
#endif
#define MRBD AMSS_Z4C_MRBD // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them
#ifndef CPBC
// for sommerfeld boundary

View File

@@ -318,6 +318,16 @@ void fill_bssn_em_matter_cuda_views(Block *cg, double **matter,
bool bssn_em_cuda_use_resident_sync(int lev)
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_RESIDENT_SYNC");
if (!env)
env = getenv("AMSS_CUDA_EM_RESIDENT_SYNC");
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
}
if (!enabled)
return false;
#ifdef WithShell
(void)lev;
return false;

View File

@@ -65,6 +65,16 @@ bool fill_bssn_escalar_cuda_views(Block *cg, MyList<var> *vars,
bool bssn_escalar_cuda_use_resident_sync(int lev)
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_RESIDENT_SYNC");
if (!env)
env = getenv("AMSS_CUDA_ESCALAR_RESIDENT_SYNC");
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
}
if (!enabled)
return false;
#ifdef WithShell
(void)lev;
return false;

View File

@@ -552,6 +552,16 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
bool bssn_cuda_use_resident_sync(int lev)
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_RESIDENT_SYNC");
if (!env)
env = getenv("AMSS_CUDA_BSSN_RESIDENT_SYNC");
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
}
if (!enabled)
return false;
(void)lev;
return true;
}

View File

@@ -2792,12 +2792,13 @@ void kern_escalar_sources(
double * __restrict__ Sxz,
double * __restrict__ Syy,
double * __restrict__ Syz,
double * __restrict__ Szz)
double * __restrict__ Szz,
double escalar_a2)
{
constexpr double PI_V = 3.141592653589793238462643383279502884;
constexpr double TWO = 2.0;
constexpr double HALF = 0.5;
constexpr double A2 = 3.0;
const double A2 = escalar_a2;
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < d_gp.all;
@@ -2852,7 +2853,7 @@ void kern_escalar_sources(
}
}
static void gpu_escalar_sources(int all)
static void gpu_escalar_sources(int all, double escalar_a2)
{
#define D(s) g_buf.slot[s]
gpu_fderivs(D(S_Sphi), D(S_Sphi_x), D(S_Sphi_y), D(S_Sphi_z), 1.0, 1.0, 1.0, all);
@@ -2872,7 +2873,8 @@ static void gpu_escalar_sources(int all)
D(S_Sphi_yy), D(S_Sphi_yz), D(S_Sphi_zz),
D(S_Sphi_rhs), D(S_Spi_rhs),
D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz),
D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz));
D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz),
escalar_a2);
#undef D
}
@@ -6571,7 +6573,8 @@ static int active_or_keyed_bank(StepContext &ctx,
return 0;
}
static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escalar = false)
static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escalar = false,
double escalar_a2 = 3.0)
{
const double SYM = 1.0;
const double ANTI = -1.0;
@@ -6652,7 +6655,7 @@ static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escala
D(S_gupyy), D(S_gupyz), D(S_gupzz));
if (compute_escalar) {
gpu_escalar_sources(all);
gpu_escalar_sources(all, escalar_a2);
gpu_fderivs(D(S_trK), D(S_trK_x), D(S_trK_y), D(S_trK_z), SYM, SYM, SYM, all);
}
@@ -7127,9 +7130,8 @@ int bssn_escalar_cuda_rk4_substep(void *block_tag,
#ifdef fortran3
set_escalar_parameter_(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
#endif
if (fabs(escalar_a2 - 3.0) > 1.0e-12 && g_dispatch.my_rank == 0) {
fprintf(stderr, "CUDA BSSN-EScalar currently supports FR a2=3 for EScalar_CC=2/3; got %.17g\n",
escalar_a2);
if (fabs(escalar_a2) <= 1.0e-300 && g_dispatch.my_rank == 0) {
fprintf(stderr, "CUDA BSSN-EScalar requires nonzero FR a2; got %.17g\n", escalar_a2);
return 1;
}
@@ -7187,7 +7189,7 @@ int bssn_escalar_cuda_rk4_substep(void *block_tag,
}
}
launch_rhs_pipeline((int)all, eps, co, true);
launch_rhs_pipeline((int)all, eps, co, true, escalar_a2);
if (apply_bam_bc) {
for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) {
@@ -7250,7 +7252,7 @@ int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double
const size_t bytes = all * sizeof(double);
setup_grid_params(ex, X, Y, Z, Symmetry, eps, 0);
upload_escalar_state_inputs(state_host_in, all);
launch_rhs_pipeline((int)all, eps, 0, true);
launch_rhs_pipeline((int)all, eps, 0, true, escalar_a2);
#define D(s) g_buf.slot[s]
kern_escalar_constraint_fr<<<grid(all), BLK>>>(

View File

@@ -111,16 +111,19 @@ TwoPunctureABE.o: TwoPunctureABE.C
# Input files
## CUDA BSSN RHS switch
## 1 : use the rewritten CUDA bssn_rhs backend
## 0 : keep the normal CPU/Fortran selection below
USE_CUDA_BSSN ?= 0
USE_CUDA_Z4C ?= 0
CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
CXXAPPFLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
CUDA_APP_FLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
## CUDA BSSN RHS switch
## 1 : use the rewritten CUDA bssn_rhs backend
## 0 : keep the normal CPU/Fortran selection below
USE_CUDA_BSSN ?= 0
USE_CUDA_Z4C ?= 0
AMSS_Z4C_MRBD ?= 0
CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
CXXAPPFLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
CUDA_APP_FLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
CXXAPPFLAGS += -DAMSS_Z4C_MRBD=$(AMSS_Z4C_MRBD)
CUDA_APP_FLAGS += -DAMSS_Z4C_MRBD=$(AMSS_Z4C_MRBD)
## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
ifeq ($(USE_CXX_KERNELS),0)