feat: Implement hybrid MPI+OpenMP parallelization

- Enable -qopenmp in makefile.inc
- Add OpenMP directives to 4th order derivatives in diff_new.f90
- Update makefile_and_run.py to dynamically calculate OMP_NUM_THREADS based on 96 cores and remove hardcoded CPU binding
This commit is contained in:
2026-02-06 13:25:07 +08:00
parent 26c81d8e81
commit 082f9c3423
3 changed files with 67 additions and 14 deletions

View File

@@ -15,16 +15,16 @@ LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
## -fp-model fast=2: Aggressive floating-point optimizations
## -fma: Enable fused multiply-add instructions
## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \
## Note: OpenMP enabled for hybrid MPI+OpenMP
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma \
f90appflags = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
-fpp -I${MKLROOT}/include
f90 = ifx
f77 = ifx
CXX = icpx
CC = icx
CLINKER = mpiicpx
CLINKER = mpiicpx -qopenmp
Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include