Compare commits


2 Commits

Author SHA1 Message Date
082f9c3423 feat: Implement hybrid MPI+OpenMP parallelization
- Enable -qopenmp in makefile.inc
- Add OpenMP directives to 4th order derivatives in diff_new.f90
- Update makefile_and_run.py to dynamically calculate OMP_NUM_THREADS based on 96 cores and remove hardcoded CPU binding
2026-02-06 13:25:07 +08:00
26c81d8e81 makefile updated 2026-01-19 23:53:16 +08:00
7 changed files with 1256 additions and 2448 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

diff_new.f90

@@ -69,6 +69,7 @@
 fy = ZEO
 fz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -151,6 +152,7 @@
 fx = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -218,6 +220,7 @@
 fy = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -282,6 +285,7 @@
 fz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -371,6 +375,7 @@
 fxz = ZEO
 fyz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -469,6 +474,7 @@
 fxx = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -531,6 +537,7 @@
 fyy = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -594,6 +601,7 @@
 fzz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -657,6 +665,7 @@
 fxy = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -719,6 +728,7 @@
 fxz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -780,6 +790,7 @@
 fyz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -866,6 +877,7 @@
 fxz = ZEO
 fyz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -997,6 +1009,7 @@
 fy = ZEO
 fz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -1151,6 +1164,7 @@
 fx = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -1227,6 +1241,7 @@
 fy = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -1297,6 +1312,7 @@
 fz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -1401,6 +1417,7 @@
 fxz = ZEO
 fyz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -1576,6 +1593,7 @@
 fxx = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -1643,6 +1661,7 @@
 fyy = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -1712,6 +1731,7 @@
 fzz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -1781,6 +1801,7 @@
 fxy = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -1851,6 +1872,7 @@
 fxz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -1919,6 +1941,7 @@
 fyz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -2011,6 +2034,7 @@
 fy = ZEO
 fz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -2127,6 +2151,7 @@
 fx = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -2212,6 +2237,7 @@
 fy = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -2288,6 +2314,7 @@
 fz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -2406,6 +2433,7 @@
 fxz = ZEO
 fyz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -2593,6 +2621,7 @@
 fxx = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -2665,6 +2694,7 @@
 fyy = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -2740,6 +2770,7 @@
 fzz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -2815,6 +2846,7 @@
 fxy = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -2895,6 +2927,7 @@
 fxz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -2973,6 +3006,7 @@
 fyz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -3080,6 +3114,7 @@
 fy = ZEO
 fz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -3216,6 +3251,7 @@
 fx = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -3311,6 +3347,7 @@
 fy = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -3395,6 +3432,7 @@
 fz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -3530,6 +3568,7 @@
 fxz = ZEO
 fyz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -3802,6 +3841,7 @@
 fxx = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -3883,6 +3923,7 @@
 fyy = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -3967,6 +4008,7 @@
 fzz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -4051,6 +4093,7 @@
 fxy = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -4153,6 +4196,7 @@
 fxz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
@@ -4253,6 +4297,7 @@
 fyz = ZEO
+!$omp parallel do collapse(3) schedule(static)
 do k=1,ex(3)-1
 do j=1,ex(2)-1
 do i=1,ex(1)-1
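
Every hunk above applies the same transformation: a derivative array is zero-filled, then the triple loop that computes it gets a !$omp parallel do collapse(3) schedule(static) directive. A minimal, self-contained sketch of that pattern is shown below; the array names, bounds, and stencil body are illustrative placeholders, not the actual AMSS-NCKU derivative code.

program collapse_demo
  use omp_lib
  implicit none
  integer, parameter :: nx = 64, ny = 64, nz = 64
  real(8), parameter :: ZEO = 0.d0
  real(8) :: f(nx,ny,nz), fx(nx,ny,nz)
  integer :: i, j, k

  call random_number(f)
  fx = ZEO   ! zero-fill before the parallel loop, as in the diff

  ! Each (i,j,k) iteration writes only fx(i,j,k), so the collapsed loop
  ! nest is race-free; i, j, k are private by the OpenMP loop rules,
  ! everything else is shared.
  !$omp parallel do collapse(3) schedule(static)
  do k = 2, nz-1
     do j = 2, ny-1
        do i = 2, nx-1
           fx(i,j,k) = 0.5d0*(f(i+1,j,k) - f(i-1,j,k))  ! toy central stencil
        end do
     end do
  end do
  !$omp end parallel do

  print *, 'max threads:', omp_get_max_threads(), '  sample:', fx(2,2,2)
end program collapse_demo

collapse(3) flattens the three loops into a single iteration space, so all available cores get work even when one extent alone is smaller than the thread count.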

makefile

@@ -34,7 +34,7 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
 F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
   prolongrestrict_cell.o prolongrestrict_vertex.o\
-  rungekutta4_rout.o bssn_rhs_opt.o bssn_rhs.o bssn_rhs_legacy.o diff_new.o kodiss.o kodiss_sh.o\
+  rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
   getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\

makefile.inc

@@ -7,25 +7,24 @@
 filein = -I/usr/include/ -I${MKLROOT}/include
-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
-LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \
-  -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
-  -lpthread -lm -ldl
+## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
+LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
 ## Aggressive optimization flags:
 ## -O3: Maximum optimization
 ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
 ## -fp-model fast=2: Aggressive floating-point optimizations
 ## -fma: Enable fused multiply-add instructions
-## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
-CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \
+## Note: OpenMP enabled for hybrid MPI+OpenMP
+CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
   -Dfortran3 -Dnewc -I${MKLROOT}/include
-f90appflags = -O3 -xHost -fp-model fast=2 -fma \
+f90appflags = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
   -fpp -I${MKLROOT}/include
 f90 = ifx
 f77 = ifx
 CXX = icpx
 CC = icx
-CLINKER = mpiicpx
+CLINKER = mpiicpx -qopenmp
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
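
A quick way to confirm that the re-enabled -qopenmp flag actually links the OpenMP runtime is a throwaway probe built with the same compiler. This is a generic sanity check, not part of the commit; the file name and build line are assumptions.

! probe.f90 -- hypothetical check, not in this repository
! build: ifx -qopenmp probe.f90 -o probe
! run:   OMP_NUM_THREADS=96 ./probe
program probe
  use omp_lib
  implicit none
  !$omp parallel
  !$omp single
  print *, 'OpenMP runtime active, team size =', omp_get_num_threads()
  !$omp end single
  !$omp end parallel
end program probe

If the probe reports a team size of 1 despite OMP_NUM_THREADS being set, the flag is not reaching the compile or link step.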

makefile_and_run.py

@@ -11,16 +11,14 @@
 import AMSS_NCKU_Input as input_data
 import subprocess
-## CPU core binding configuration using taskset
-## taskset ensures all child processes inherit the CPU affinity mask
-## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
-## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
-NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
+## CPU core binding configuration
+## Removed hardcoded taskset to allow full utilization of 96 cores via MPI+OpenMP
+NUMACTL_CPU_BIND = ""
 ## Build parallelism configuration
-## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
 ## Set make -j to utilize available cores for faster builds
-BUILD_JOBS = 104
+BUILD_JOBS = 96
 ##################################################################
@@ -37,7 +35,7 @@ def makefile_ABE():
     print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " )
     print( )
-    ## Build command with CPU binding to nohz_full cores
+    ## Build command
     if (input_data.GPU_Calculation == "no"):
         makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
     elif (input_data.GPU_Calculation == "yes"):
@@ -78,7 +76,7 @@ def makefile_TwoPunctureABE():
     print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
     print( )
-    ## Build command with CPU binding to nohz_full cores
+    ## Build command
     makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
     ## Execute the command with subprocess.Popen and stream output
@@ -113,13 +111,23 @@ def run_ABE():
     print( " Running the AMSS-NCKU executable file ABE/ABEGPU " )
     print( )
+    ## Calculate OMP_NUM_THREADS
+    ## User has 96 cores. Calculate threads per MPI process.
+    total_physical_cores = 96
+    omp_num_threads = total_physical_cores // input_data.MPI_processes
+    if omp_num_threads < 1:
+        omp_num_threads = 1
+    print( f" Configuration: {input_data.MPI_processes} MPI processes, {omp_num_threads} OpenMP threads per process." )
+    print( f" Total cores utilized: {input_data.MPI_processes * omp_num_threads}" )
     ## Define the command to run; cast other values to strings as needed
     if (input_data.GPU_Calculation == "no"):
-        mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        mpi_command = f"{NUMACTL_CPU_BIND} mpirun -genv OMP_NUM_THREADS {omp_num_threads} -np {input_data.MPI_processes} ./ABE"
         mpi_command_outfile = "ABE_out.log"
     elif (input_data.GPU_Calculation == "yes"):
-        mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command = f"{NUMACTL_CPU_BIND} mpirun -genv OMP_NUM_THREADS {omp_num_threads} -np {input_data.MPI_processes} ./ABEGPU"
         mpi_command_outfile = "ABEGPU_out.log"
     ## Execute the MPI command and stream output
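
The net effect of the run-script change is a hybrid launch in which mpirun's -genv exports OMP_NUM_THREADS to every rank, e.g. 8 ranks x 12 threads = 96 cores. A hypothetical hello-world in the same spirit can confirm the rank/thread layout; the mpiifx wrapper name and the FUNNELED threading level below are assumptions, not taken from this repository.

! hybrid_hello.f90 -- illustrative only, not part of this commit
! build: mpiifx -qopenmp hybrid_hello.f90 -o hybrid_hello
! run:   mpirun -genv OMP_NUM_THREADS 12 -np 8 ./hybrid_hello
program hybrid_hello
  use mpi
  use omp_lib
  implicit none
  integer :: ierr, rank, nranks, provided

  ! FUNNELED suffices here: only the master thread makes MPI calls.
  call MPI_Init_thread(MPI_THREAD_FUNNELED, provided, ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
  call MPI_Comm_size(MPI_COMM_WORLD, nranks, ierr)

  ! Each rank spawns its own OpenMP team and reports its layout.
  !$omp parallel
  print '(a,i0,a,i0,a,i0,a,i0)', 'rank ', rank, '/', nranks, &
        ' thread ', omp_get_thread_num(), '/', omp_get_num_threads()
  !$omp end parallel

  call MPI_Finalize(ierr)
end program hybrid_hello

Each rank should report the full per-rank thread count; if every rank prints a team of 1, either the -qopenmp link flag or the -genv export is not reaching the executable.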