Files
AMSS-NCKU/AMSS_NCKU_source/makefile.inc
CGH0S7 714c6e90c6 Add OpenMP parallelization to Fortran compute kernels
Add !$omp parallel do collapse(2) directives to all triple-loop
stencil kernels (fderivs, fdderivs, fdx/fdy/fdz, kodis, lopsided,
enforce_ag/enforce_ga) across all ghost_width variants. Add !$omp
parallel workshare to RK4/ICN/Euler whole-array update routines.

Build system: add -qopenmp to compile and link flags, switch MKL
from sequential to threaded (-lmkl_intel_thread -liomp5).

Runtime: set OMP_NUM_THREADS=96, OMP_STACKSIZE=16M, OMP_PROC_BIND=close,
OMP_PLACES=cores for 96-core server target.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-02-10 23:40:17 +08:00

31 lines
1.8 KiB
PHP
Executable File

## GCC version (commented out)
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Intel oneAPI version with oneMKL (Optimized for performance)
filein = -I/usr/include/ -I${MKLROOT}/include
## Using OpenMP-threaded MKL for parallel performance
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lifcore -limf -lpthread -lm -ldl
## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
## -fprofile-instr-use: use collected profile data to guide optimization decisions
## (branch prediction, basic block layout, inlining, loop unrolling)
PROFDATA = /home/amss/AMSS-NCKU/pgo_profile/default.profdata
CXXAPPFLAGS = -O3 -march=native -fp-model fast=2 -fma -ipo -qopenmp \
-DMPI_STUB -Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -march=native -fp-model fast=2 -fma -ipo -qopenmp \
-align array64byte -fpp -I${MKLROOT}/include
f90 = ifx
f77 = ifx
CXX = icpx
CC = icx
CLINKER = icpx
Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc