## Toolchain selection
##   intel : Intel oneAPI toolchain (legacy path; current default, see below)
##   nvhpc : NVIDIA HPC SDK + CUDA-aware MPI
## NOTE(review): this header used to label nvhpc as the default while the
## assignment below selects intel. The assignment is left unchanged so existing
## builds behave the same; confirm which default is actually intended.
TOOLCHAIN ?= intel

## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
##   opt        : (default) maximum performance with PGO profile-guided optimization
##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
PGO_MODE ?= opt

## Interp_Points load balance profiling mode
##   off      : (default) no load balance instrumentation
##   profile  : Pass 1 — instrument Interp_Points to collect timing profile
##   optimize : Pass 2 — read profile and apply block rebalancing
## Any value other than profile/optimize yields empty INTERP_LB_FLAGS.
INTERP_LB_MODE ?= off

ifeq ($(INTERP_LB_MODE),profile)
  INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
else ifeq ($(INTERP_LB_MODE),optimize)
  INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
else
  INTERP_LB_FLAGS =
endif

## Installation prefixes; every one can be overridden from the command line
## or the environment.
MKLROOT    ?= /home/intel/oneapi/mkl/latest
MKL_LIBDIR ?= $(MKLROOT)/lib/intel64
MKL_INC    ?= -I$(MKLROOT)/include

NVHPC_ROOT ?= /home/nvidia/hpc_sdk/Linux_x86_64/25.11
CUDA_HOME  ?= $(NVHPC_ROOT)/cuda
CUDA_ARCH  ?= sm_80

## Kernel implementation switch
##   1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
##   0           : fall back to original Fortran kernels
USE_CXX_KERNELS ?= 1

## Z4C Cartesian RHS kernel switch
##   1 (default) : use C++ rewrite of Z4c_rhs (main Cartesian path faster)
##   0           : use original Fortran Z4c_rhs.o
USE_CXX_Z4C_KERNELS ?= 1

## RK4 kernel implementation switch
##   1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
##   0           : use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1

## Memory allocator switch
##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
##   0           : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO  ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so

## Link the shared object by full path when it exists at TBBMALLOC_SO;
## otherwise fall back to -ltbbmalloc and let the linker search for it.
## The --no-as-needed/--as-needed pair brackets the allocator library only.
ifneq ($(wildcard $(TBBMALLOC_SO)),)
  TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
  TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif

## Per-toolchain compilers, include paths (filein) and link libraries (LDLIBS).
ifeq ($(TOOLCHAIN),intel)
  f90     = ifx
  f77     = ifx
  CXX     = icpx
  CC      = icx
  CLINKER = mpiicpx
  filein  = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
  LDLIBS  = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
            -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
            -lifcore -limf -liomp5 -lpthread -lm -ldl \
            -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart
else ifeq ($(TOOLCHAIN),nvhpc)
  f90     = mpifort
  f77     = mpifort
  CXX     = mpicxx
  CC      = mpicc
  CLINKER = mpicxx
  filein  = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
  LDLIBS  = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
            -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
            -lpthread -lm -ldl \
            -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart \
            -fortranlibs
else
  # Without this branch an unknown value would leave f90/f77/CLINKER/filein/LDLIBS
  # unset and CXX/CC on make's built-in defaults, so fail early instead.
  $(error Unsupported TOOLCHAIN '$(TOOLCHAIN)'; expected 'intel' or 'nvhpc')
endif

## Prepend the allocator so it appears before every other library on the link
## line. ':=' is required here because LDLIBS refers to its own previous value.
ifeq ($(USE_TBBMALLOC),1)
  LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif

## CUDA compiler and flags.
## Cu is the nvcc shipped under NVHPC_ROOT; CUDA_LIB_PATH carries both the
## library (-L) and header (-I) search paths under CUDA_HOME.
Cu             = $(NVHPC_ROOT)/compilers/bin/nvcc
CUDA_LIB_PATH  = -L$(CUDA_HOME)/lib64 -I$(CUDA_HOME)/include
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=$(CUDA_ARCH)