# Patch summary (preamble text before the first "diff --git" header).
#
# Files touched:
#   AMSS_NCKU_source/makefile
#   AMSS_NCKU_source/makefile.inc
#
# AMSS_NCKU_source/makefile:
#   * Replaces -xHost with -march=x86-64-v4 in five flag definitions:
#       - CXXAPPFLAGS and f90appflags in the PGO_MODE=instrument branch
#       - CXXAPPFLAGS and f90appflags in the else (opt) branch
#       - TP_OPTFLAGS (TwoPunctureABE)
#     Every other flag on those lines is unchanged.
#
# AMSS_NCKU_source/makefile.inc:
#   * Flips the ?= default from 1 to 0 for USE_CXX_KERNELS,
#     USE_CXX_Z4C_KERNELS, USE_CXX_ESCALAR_KERNEL and USE_CXX_RK4.
#   * Moves the "(default)" marker in the comments from the "1" line to the
#     "0" line for USE_CXX_KERNELS, USE_CXX_Z4C_KERNELS and USE_CXX_RK4.
#   * USE_CXX_EM_KERNEL (?= 0) and USE_TRANSFER_CACHE (?= auto) appear only as
#     context and are not modified.
#
# NOTE(review): the instrument-branch comment says "-ipo/-fp-model fast=2" are
#   omitted, but both CXXAPPFLAGS and f90appflags in that branch still pass
#   -ipo; only -fp-model fast=2 is absent. Comment and flags disagree.
# NOTE(review): in the USE_CXX_ESCALAR_KERNEL comment block the "(default)"
#   marker is removed from the "1" line but not added to the "0" line, unlike
#   the other three switches changed here.
# NOTE(review): the ESCALAR note says the wrapper requires USE_CXX_KERNELS=1;
#   after this patch both variables default to 0.
# NOTE(review): -xHost is replaced by a fixed -march level; presumably
#   x86-64-v4 requires AVX-512 on every machine that runs the binaries --
#   TODO confirm against the intended build/run hosts.
# NOTE(review): in this copy of the patch the line breaks inside the hunks
#   have been collapsed to spaces, so it will not apply as-is; regenerate it
#   from the repository rather than repairing this text by hand.
#
diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 04978c2..bcf4923 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -65,10 +65,10 @@ PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata ifeq ($(PGO_MODE),instrument) ## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability -CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ +CXXAPPFLAGS = -O3 -march=x86-64-v4 -fma -fprofile-instr-generate -ipo \ -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \ $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG) -f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \ +f90appflags = -O3 -march=x86-64-v4 -fma -fprofile-instr-generate -ipo \ -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) else ## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \ @@ -76,10 +76,10 @@ else ## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization -CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ +CXXAPPFLAGS = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \ -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \ $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG) -f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ +f90appflags = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \ -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) endif @@ -147,7 +147,7 @@ z4c_rhs_c.o: z4c_rhs_c.C ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata -TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ +TP_OPTFLAGS = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \ -fprofile-instr-use=$(TP_PROFDATA) \ -Dfortran3 -Dnewc -I${MKLROOT}/include diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 
632381d..2f4ecf9 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -44,20 +44,20 @@ INTERP_LB_FLAGS = endif ## Kernel implementation switch -## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster) -## 0 : fall back to original Fortran kernels -USE_CXX_KERNELS ?= 1 +## 1 : use C++ rewrite of bssn_rhs and helper kernels (faster) +## 0 (default): fall back to original Fortran kernels +USE_CXX_KERNELS ?= 0 ## Z4C Cartesian RHS kernel switch -## 1 (default) : use C++ rewrite of Z4c_rhs (main Cartesian path faster) -## 0 : use original Fortran Z4c_rhs.o -USE_CXX_Z4C_KERNELS ?= 1 +## 1 : use C++ rewrite of Z4c_rhs (main Cartesian path faster) +## 0 (default): use original Fortran Z4c_rhs.o +USE_CXX_Z4C_KERNELS ?= 0 ## BSSN-EScalar RHS switch -## 1 (default) : use BSSN-EScalar C wrapper on the normal patch path +## 1 : use BSSN-EScalar C wrapper on the normal patch path ## 0 : keep the original Fortran BSSN-EScalar RHS for precision-safe runs ## Note: this requires USE_CXX_KERNELS=1 because the wrapper reuses the C BSSN kernel. -USE_CXX_ESCALAR_KERNEL ?= 1 +USE_CXX_ESCALAR_KERNEL ?= 0 ## BSSN-EM RHS switch ## 1 : use BSSN-EM C kernel (bssn_em_rhs_c.C) on the normal patch path @@ -72,9 +72,9 @@ USE_CXX_EM_KERNEL ?= 0 USE_TRANSFER_CACHE ?= auto ## RK4 kernel implementation switch -## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments) -## 0 : use original Fortran rungekutta4_rout.o -USE_CXX_RK4 ?= 1 +## 1 : use C/C++ rewrite of rungekutta4_rout (for optimization experiments) +## 0 (default): use original Fortran rungekutta4_rout.o +USE_CXX_RK4 ?= 0 f90 = ifx f77 = ifx