Add OpenMP parallelization to Fortran compute kernels

Add !$omp parallel do collapse(2) directives to all triple-loop stencil kernels (fderivs, fdderivs, fdx/fdy/fdz, kodis, lopsided, enforce_ag/enforce_ga) across all ghost_width variants. Add !$omp parallel workshare to the RK4/ICN/Euler whole-array update routines.

Build system: add -qopenmp to the compile and link flags, and switch MKL from sequential to threaded (-lmkl_intel_thread -liomp5).

Runtime: set OMP_NUM_THREADS=96, OMP_STACKSIZE=16M, OMP_PROC_BIND=close, OMP_PLACES=cores for the 96-core server target.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-02-10 23:40:17 +08:00
parent caf192b2e4
commit 714c6e90c6
10 changed files with 171 additions and 32 deletions
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -6,17 +6,17 @@
 ## Intel oneAPI version with oneMKL (Optimized for performance)
 filein  = -I/usr/include/ -I${MKLROOT}/include

-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
+## Using OpenMP-threaded MKL for parallel performance
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
+LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lifcore -limf -lpthread -lm -ldl

 ## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
 ## -fprofile-instr-use: use collected profile data to guide optimization decisions
 ##   (branch prediction, basic block layout, inlining, loop unrolling)
 PROFDATA     = /home/amss/AMSS-NCKU/pgo_profile/default.profdata
-CXXAPPFLAGS  = -O3 -march=native -fp-model fast=2 -fma -ipo \
+CXXAPPFLAGS  = -O3 -march=native -fp-model fast=2 -fma -ipo -qopenmp \
               -DMPI_STUB -Dfortran3 -Dnewc -I${MKLROOT}/include
-f90appflags  = -O3 -march=native -fp-model fast=2 -fma -ipo \
+f90appflags  = -O3 -march=native -fp-model fast=2 -fma -ipo -qopenmp \
               -align array64byte -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx