## AMSS-NCKU/AMSS_NCKU_source/makefile.inc

## GCC version (commented out)
## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran

## Include search path for the Intel oneAPI + oneMKL build.
## MKLROOT is not set in this file; it is expanded lazily, each time
## filein is used (recursive assignment).
filein = \
    -I/usr/include/ \
    -I$(MKLROOT)/include

## Link line, grouped by purpose:
##   - oneMKL with the sequential (non-threaded) layer
##   - Intel Fortran runtime (-lifcore) and Intel math library (-limf)
##   - system libraries
## NOTE(review): -liomp5 is still linked although the sequential MKL layer
## is selected -- presumably needed by other objects in the build; confirm
## before removing it.
LDLIBS = \
    -L$(MKLROOT)/lib \
    -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
    -lifcore -limf \
    -lpthread -lm -ldl \
    -liomp5

## PGO (profile-guided optimization) build mode.
## Applies to ABE only; TwoPunctureABE always uses the opt flags.
##   opt        : default -- maximum performance using PGO profile data
##   instrument : PGO phase 1 -- instrumented build that collects a fresh profile
## The default below applies only when PGO_MODE has not been defined
## anywhere else (command line, environment, or an earlier makefile).
ifeq ($(origin PGO_MODE),undefined)
  PGO_MODE = opt
endif

## Interp_Points load balance profiling mode
##   off        : (default) no load balance instrumentation
##   profile    : Pass 1 -- instrument Interp_Points to collect timing profile
##   optimize   : Pass 2 -- read profile and apply block rebalancing
## Override on the command line, e.g. `make INTERP_LB_MODE=profile`.
INTERP_LB_MODE ?= off

## Map the mode onto the preprocessor define stored in INTERP_LB_FLAGS.
## The value is stripped before comparison so stray whitespace
## (e.g. INTERP_LB_MODE="profile ") still selects the intended mode.
ifeq ($(strip $(INTERP_LB_MODE)),profile)
INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
else ifeq ($(strip $(INTERP_LB_MODE)),optimize)
INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
else
INTERP_LB_FLAGS =
## Any other value still builds without instrumentation, exactly as before,
## but a non-empty value other than "off" is most likely a typo -- say so
## instead of silently ignoring the request.
ifneq ($(filter-out off,$(strip $(INTERP_LB_MODE))),)
$(warning INTERP_LB_MODE='$(INTERP_LB_MODE)' is not one of off/profile/optimize; building without load balance instrumentation)
endif
endif

## Selects which kernel implementation is built.
##   1 : default -- C++ rewrite of bssn_rhs and the helper kernels (faster)
##   0 : original Fortran kernels
## Only takes effect when USE_CXX_KERNELS is not already defined elsewhere.
ifeq ($(origin USE_CXX_KERNELS),undefined)
  USE_CXX_KERNELS = 1
endif
## Toolchain selection.
## C and C++ compiler drivers:
CC = icx
CXX = icpx
## Fortran: the same driver is used for both the f77 and f90 variables.
f77 = ifx
f90 = ifx
## Command used for the final link step:
CLINKER = mpiicpx

## CUDA settings.
## Compile flags passed to the CUDA compiler. The commented-out variant
## additionally pins -arch/-code to compute_13 / sm_13.
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = \
    -c -g -O3 \
    --ptxas-options=-v \
    -Dfortran3 -Dnewc
## Despite its name, this variable carries both the library directory (-L)
## and the header directories (-I) of the CUDA installation.
CUDA_LIB_PATH = \
    -L/usr/lib/cuda/lib64 \
    -I/usr/include \
    -I/usr/lib/cuda/include
## CUDA compiler driver.
Cu = nvcc