Add MPI+OpenMP hybrid parallelism (48 ranks x 2 threads) for full 96-core utilization

Enable OpenMP threading in the finite-difference kernels (diff_new, diff_new_sh, diff_newwb, lopsidediff, kodiss, kodiss_sh) by adding collapse(3) directives to 36 triple-nested loops. Update the build flags (-qopenmp), MPI process binding, and runtime configuration to match.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 15:53:15 +08:00
parent 223ec17a54
commit 4eb698f496
9 changed files with 65 additions and 27 deletions
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -8,17 +8,17 @@ filein  = -I/usr/include/ -I${MKLROOT}/include

 ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
+LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -qopenmp

 ## Aggressive optimization flags:
 ## -O3: Maximum optimization
 ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
 ## -fp-model fast=2: Aggressive floating-point optimizations
 ## -fma: Enable fused multiply-add instructions
-## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
-CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma \
+## OpenMP re-enabled for MPI+OpenMP hybrid parallelism (MKL stays sequential to avoid nested parallelism)
+CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
               -Dfortran3 -Dnewc -I${MKLROOT}/include
-f90appflags  = -O3 -xHost -fp-model fast=2 -fma \
+f90appflags  = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
               -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx