Fix CUDA AMR symmetry drift
This commit is contained in:
@@ -194,7 +194,7 @@ bool bssn_escalar_cuda_bh_interp_resident_enabled()
|
|||||||
if (enabled < 0)
|
if (enabled < 0)
|
||||||
{
|
{
|
||||||
const char *env = getenv("AMSS_CUDA_BH_INTERP_RESIDENT");
|
const char *env = getenv("AMSS_CUDA_BH_INTERP_RESIDENT");
|
||||||
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
|
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 0;
|
||||||
}
|
}
|
||||||
return enabled != 0;
|
return enabled != 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1021,7 +1021,9 @@ void bssn_cuda_sync_level_bh_fields(MyList<Patch> *PatL,
|
|||||||
while (BP)
|
while (BP)
|
||||||
{
|
{
|
||||||
Block *cg = BP->data;
|
Block *cg = BP->data;
|
||||||
if (myrank == cg->rank && !bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false))
|
if (myrank == cg->rank &&
|
||||||
|
bssn_cuda_has_resident_state(cg) &&
|
||||||
|
!bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false))
|
||||||
{
|
{
|
||||||
cout << "CUDA BH state subset download failed" << endl;
|
cout << "CUDA BH state subset download failed" << endl;
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
@@ -1058,7 +1060,7 @@ bool bssn_cuda_bh_interp_resident_enabled()
|
|||||||
if (env)
|
if (env)
|
||||||
enabled = (atoi(env) != 0) ? 1 : 0;
|
enabled = (atoi(env) != 0) ? 1 : 0;
|
||||||
else
|
else
|
||||||
enabled = 1;
|
enabled = 0;
|
||||||
}
|
}
|
||||||
return enabled != 0;
|
return enabled != 0;
|
||||||
}
|
}
|
||||||
@@ -8589,6 +8591,23 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
|
|||||||
{
|
{
|
||||||
const int InList = 3;
|
const int InList = 3;
|
||||||
|
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
const bool use_resident_bh_interp = bssn_cuda_bh_interp_resident_enabled();
|
||||||
|
if (!use_resident_bh_interp && bssn_cuda_use_resident_sync(ilev))
|
||||||
|
{
|
||||||
|
MyList<var> *host_state_list = 0;
|
||||||
|
if (forx == Sfx0 && fory == Sfy0 && forz == Sfz0)
|
||||||
|
host_state_list = StateList;
|
||||||
|
else if (forx == Sfx && fory == Sfy && forz == Sfz)
|
||||||
|
host_state_list = SynchList_pre;
|
||||||
|
else if (forx == Sfx1 && fory == Sfy1 && forz == Sfz1)
|
||||||
|
host_state_list = SynchList_cor;
|
||||||
|
|
||||||
|
if (host_state_list)
|
||||||
|
bssn_cuda_download_level_state_if_present(GH->PatL[ilev], host_state_list, myrank);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
MyList<var> *DG_List = new MyList<var>(forx);
|
MyList<var> *DG_List = new MyList<var>(forx);
|
||||||
DG_List->insert(fory);
|
DG_List->insert(fory);
|
||||||
DG_List->insert(forz);
|
DG_List->insert(forz);
|
||||||
@@ -8609,7 +8628,7 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
|
|||||||
int lev = ilev;
|
int lev = ilev;
|
||||||
|
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
if (bssn_cuda_bh_interp_resident_enabled() &&
|
if (use_resident_bh_interp &&
|
||||||
bssn_cuda_use_resident_sync(lev) &&
|
bssn_cuda_use_resident_sync(lev) &&
|
||||||
bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf))
|
bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf))
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -7693,15 +7693,15 @@ __device__ __forceinline__ double load_comm_state_cell_sym(const double * __rest
|
|||||||
{
|
{
|
||||||
double s = 1.0;
|
double s = 1.0;
|
||||||
if (x < 0) {
|
if (x < 0) {
|
||||||
x = -x;
|
x = -x - 1;
|
||||||
s *= d_comm_state_soa[3 * state_index + 0];
|
s *= d_comm_state_soa[3 * state_index + 0];
|
||||||
}
|
}
|
||||||
if (y < 0) {
|
if (y < 0) {
|
||||||
y = -y;
|
y = -y - 1;
|
||||||
s *= d_comm_state_soa[3 * state_index + 1];
|
s *= d_comm_state_soa[3 * state_index + 1];
|
||||||
}
|
}
|
||||||
if (z < 0) {
|
if (z < 0) {
|
||||||
z = -z;
|
z = -z - 1;
|
||||||
s *= d_comm_state_soa[3 * state_index + 2];
|
s *= d_comm_state_soa[3 * state_index + 2];
|
||||||
}
|
}
|
||||||
const int src = x + y * nx + z * nx * ny;
|
const int src = x + y * nx + z * nx * ny;
|
||||||
|
|||||||
@@ -18,9 +18,9 @@ OMP_FLAG = -qopenmp
|
|||||||
|
|
||||||
ifeq ($(PGO_MODE),instrument)
|
ifeq ($(PGO_MODE),instrument)
|
||||||
## Intel Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
|
## Intel Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
|
||||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
CXXAPPFLAGS = -O3 -march=znver5 -fma -fprofile-instr-generate -ipo \
|
||||||
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
|
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
|
||||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
f90appflags = -O3 -march=znver5 -fma -fprofile-instr-generate -ipo \
|
||||||
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
|
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
|
||||||
else
|
else
|
||||||
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
|
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
|
||||||
@@ -28,23 +28,23 @@ else
|
|||||||
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
|
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
|
||||||
|
|
||||||
|
|
||||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
CXXAPPFLAGS = -O3 -march=znver5 -fp-model fast=2 -fma -ipo \
|
||||||
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
|
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
|
||||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
f90appflags = -O3 -march=znver5 -fp-model fast=2 -fma -ipo \
|
||||||
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
|
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
TP_OPTFLAGS = -O3 -march=znver5 -fp-model fast=2 -fma -ipo \
|
||||||
-Dfortran3 -Dnewc $(MKL_INC)
|
-Dfortran3 -Dnewc $(MKL_INC)
|
||||||
else
|
else
|
||||||
## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
|
## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
|
||||||
## PGO_MODE is ignored in this branch.
|
## PGO_MODE is ignored in this branch.
|
||||||
OMP_FLAG = -mp
|
OMP_FLAG = -mp
|
||||||
CXXAPPFLAGS = -O3 -tp=host -Mcache_align -Mfma \
|
CXXAPPFLAGS = -O3 -march=znver5 -tp=host -Mcache_align -Mfma \
|
||||||
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
|
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
|
||||||
f90appflags = -O3 -tp=host -Mcache_align -Mfma -Mpreprocess \
|
f90appflags = -O3 -march=znver5 -tp=host -Mcache_align -Mfma -Mpreprocess \
|
||||||
$(MKL_INC) $(POLINT6_FLAG)
|
$(MKL_INC) $(POLINT6_FLAG)
|
||||||
TP_OPTFLAGS = -O3 -tp=host -Mcache_align -Mfma \
|
TP_OPTFLAGS = -O3 -march=znver5 -tp=host -Mcache_align -Mfma \
|
||||||
-Dfortran3 -Dnewc $(MKL_INC)
|
-Dfortran3 -Dnewc $(MKL_INC)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|||||||
@@ -5206,15 +5206,15 @@ __device__ __forceinline__ double load_comm_state_cell_sym(const double * __rest
|
|||||||
{
|
{
|
||||||
double s = 1.0;
|
double s = 1.0;
|
||||||
if (x < 0) {
|
if (x < 0) {
|
||||||
x = -x;
|
x = -x - 1;
|
||||||
s *= d_comm_state_soa[3 * state_index + 0];
|
s *= d_comm_state_soa[3 * state_index + 0];
|
||||||
}
|
}
|
||||||
if (y < 0) {
|
if (y < 0) {
|
||||||
y = -y;
|
y = -y - 1;
|
||||||
s *= d_comm_state_soa[3 * state_index + 1];
|
s *= d_comm_state_soa[3 * state_index + 1];
|
||||||
}
|
}
|
||||||
if (z < 0) {
|
if (z < 0) {
|
||||||
z = -z;
|
z = -z - 1;
|
||||||
s *= d_comm_state_soa[3 * state_index + 2];
|
s *= d_comm_state_soa[3 * state_index + 2];
|
||||||
}
|
}
|
||||||
const int src = x + y * nx + z * nx * ny;
|
const int src = x + y * nx + z * nx * ny;
|
||||||
|
|||||||
Reference in New Issue
Block a user