diff --git a/AMSS_NCKU_source/bssnEScalar_class.C b/AMSS_NCKU_source/bssnEScalar_class.C index 8de5829..48ab025 100644 --- a/AMSS_NCKU_source/bssnEScalar_class.C +++ b/AMSS_NCKU_source/bssnEScalar_class.C @@ -194,7 +194,7 @@ bool bssn_escalar_cuda_bh_interp_resident_enabled() if (enabled < 0) { const char *env = getenv("AMSS_CUDA_BH_INTERP_RESIDENT"); - enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1; + enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 0; } return enabled != 0; } diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index e70584e..2b88c67 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -1021,7 +1021,9 @@ void bssn_cuda_sync_level_bh_fields(MyList *PatL, while (BP) { Block *cg = BP->data; - if (myrank == cg->rank && !bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false)) + if (myrank == cg->rank && + bssn_cuda_has_resident_state(cg) && + !bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false)) { cout << "CUDA BH state subset download failed" << endl; MPI_Abort(MPI_COMM_WORLD, 1); @@ -1058,7 +1060,7 @@ bool bssn_cuda_bh_interp_resident_enabled() if (env) enabled = (atoi(env) != 0) ? 1 : 0; else - enabled = 1; + enabled = 0; } return enabled != 0; } @@ -8589,6 +8591,23 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va { const int InList = 3; +#if USE_CUDA_BSSN + const bool use_resident_bh_interp = bssn_cuda_bh_interp_resident_enabled(); + if (!use_resident_bh_interp && bssn_cuda_use_resident_sync(ilev)) + { + MyList *host_state_list = 0; + if (forx == Sfx0 && fory == Sfy0 && forz == Sfz0) + host_state_list = StateList; + else if (forx == Sfx && fory == Sfy && forz == Sfz) + host_state_list = SynchList_pre; + else if (forx == Sfx1 && fory == Sfy1 && forz == Sfz1) + host_state_list = SynchList_cor; + + if (host_state_list) + bssn_cuda_download_level_state_if_present(GH->PatL[ilev], host_state_list, myrank); + } +#endif + MyList *DG_List = new MyList(forx); DG_List->insert(fory); DG_List->insert(forz); @@ -8609,7 +8628,7 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va int lev = ilev; #if USE_CUDA_BSSN - if (bssn_cuda_bh_interp_resident_enabled() && + if (use_resident_bh_interp && bssn_cuda_use_resident_sync(lev) && bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf)) { diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 6313e5f..1b0c4d5 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -7693,15 +7693,15 @@ __device__ __forceinline__ double load_comm_state_cell_sym(const double * __rest { double s = 1.0; if (x < 0) { - x = -x; + x = -x - 1; s *= d_comm_state_soa[3 * state_index + 0]; } if (y < 0) { - y = -y; + y = -y - 1; s *= d_comm_state_soa[3 * state_index + 1]; } if (z < 0) { - z = -z; + z = -z - 1; s *= d_comm_state_soa[3 * state_index + 2]; } const int src = x + y * nx + z * nx * ny; diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index b3f6914..fee25b0 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -18,9 +18,9 @@ OMP_FLAG = -qopenmp ifeq ($(PGO_MODE),instrument) ## Intel Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability -CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ +CXXAPPFLAGS = -O3 -march=znver5 -fma -fprofile-instr-generate -ipo \ -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS) -f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \ +f90appflags = -O3 -march=znver5 -fma -fprofile-instr-generate -ipo \ -align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG) else ## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \ @@ -28,23 +28,23 @@ else ## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization -CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ +CXXAPPFLAGS = -O3 -march=znver5 -fp-model fast=2 -fma -ipo \ -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS) -f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ +f90appflags = -O3 -march=znver5 -fp-model fast=2 -fma -ipo \ -align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG) endif -TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ +TP_OPTFLAGS = -O3 -march=znver5 -fp-model fast=2 -fma -ipo \ -Dfortran3 -Dnewc $(MKL_INC) else ## NVHPC defaults: mpicc/mpicxx/mpifort wrappers ## PGO_MODE is ignored in this branch. OMP_FLAG = -mp -CXXAPPFLAGS = -O3 -tp=host -Mcache_align -Mfma \ +CXXAPPFLAGS = -O3 -march=znver5 -tp=host -Mcache_align -Mfma \ -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS) -f90appflags = -O3 -tp=host -Mcache_align -Mfma -Mpreprocess \ +f90appflags = -O3 -march=znver5 -tp=host -Mcache_align -Mfma -Mpreprocess \ $(MKL_INC) $(POLINT6_FLAG) -TP_OPTFLAGS = -O3 -tp=host -Mcache_align -Mfma \ +TP_OPTFLAGS = -O3 -march=znver5 -tp=host -Mcache_align -Mfma \ -Dfortran3 -Dnewc $(MKL_INC) endif diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu index 4e5b3c9..1e06ee2 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.cu +++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu @@ -5206,15 +5206,15 @@ __device__ __forceinline__ double load_comm_state_cell_sym(const double * __rest { double s = 1.0; if (x < 0) { - x = -x; + x = -x - 1; s *= d_comm_state_soa[3 * state_index + 0]; } if (y < 0) { - y = -y; + y = -y - 1; s *= d_comm_state_soa[3 * state_index + 1]; } if (z < 0) { - z = -z; + z = -z - 1; s *= d_comm_state_soa[3 * state_index + 2]; } const int src = x + y * nx + z * nx * ny;