Switch to NVIDIA HPC Toolchain

This commit is contained in:
2026-04-29 08:30:47 +08:00
parent c689cc8dc9
commit a0dab90bcb
2 changed files with 75 additions and 46 deletions

View File

@@ -13,12 +13,15 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
ifeq ($(TOOLCHAIN),intel)
OMP_FLAG = -qopenmp
ifeq ($(PGO_MODE),instrument)
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
## Intel Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
else
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
## PGO has been turned off, now tested and found to be negative optimization
@@ -26,9 +29,24 @@ else
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
endif
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc $(MKL_INC)
else
## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
## PGO_MODE is ignored in this branch.
OMP_FLAG = -mp
CXXAPPFLAGS = -O3 -tp=host -Mcache_align -Mfma \
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
f90appflags = -O3 -tp=host -Mcache_align -Mfma -Mpreprocess \
$(MKL_INC) $(POLINT6_FLAG)
TP_OPTFLAGS = -O3 -tp=host -Mcache_align -Mfma \
-Dfortran3 -Dnewc $(MKL_INC)
endif
.SUFFIXES: .o .f90 .C .for .cu
@@ -78,17 +96,11 @@ z4c_rhs_c.o: z4c_rhs_c.C
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc -I${MKLROOT}/include
TwoPunctures.o: TwoPunctures.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
# Input files
@@ -242,7 +254,7 @@ ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
# $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
$(CLINKER) $(TP_OPTFLAGS) $(OMP_FLAG) -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean:
rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f