Compare commits

..

2 Commits

Author SHA1 Message Date
082f9c3423 feat: Implement hybrid MPI+OpenMP parallelization
- Enable -qopenmp in makefile.inc
- Add OpenMP directives to 4th order derivatives in diff_new.f90
- Update makefile_and_run.py to dynamic calculate OMP_NUM_THREADS based on 96 cores and remove hardcoded CPU binding
2026-02-06 13:25:07 +08:00
26c81d8e81 makefile updated 2026-01-19 23:53:16 +08:00
7 changed files with 1256 additions and 2448 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -69,6 +69,7 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -151,6 +152,7 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -218,6 +220,7 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -282,6 +285,7 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -371,6 +375,7 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -469,6 +474,7 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -531,6 +537,7 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -594,6 +601,7 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -657,6 +665,7 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -719,6 +728,7 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -780,6 +790,7 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -866,6 +877,7 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -997,6 +1009,7 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1151,6 +1164,7 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1227,6 +1241,7 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1297,6 +1312,7 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1401,6 +1417,7 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1576,6 +1593,7 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1643,6 +1661,7 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1712,6 +1731,7 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1781,6 +1801,7 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1851,6 +1872,7 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1919,6 +1941,7 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2011,6 +2034,7 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2127,6 +2151,7 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2212,6 +2237,7 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2288,6 +2314,7 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2406,6 +2433,7 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2593,6 +2621,7 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2665,6 +2694,7 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2740,6 +2770,7 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2815,6 +2846,7 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2895,6 +2927,7 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2973,6 +3006,7 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3080,6 +3114,7 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3216,6 +3251,7 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3311,6 +3347,7 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3395,6 +3432,7 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3530,6 +3568,7 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3802,6 +3841,7 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3883,6 +3923,7 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3967,6 +4008,7 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -4051,6 +4093,7 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -4153,6 +4196,7 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -4253,6 +4297,7 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1

View File

@@ -34,7 +34,7 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
prolongrestrict_cell.o prolongrestrict_vertex.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\
rungekutta4_rout.o bssn_rhs_opt.o bssn_rhs.o bssn_rhs_legacy.o diff_new.o kodiss.o kodiss_sh.o\ rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\ lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\ shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\ getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\

View File

@@ -7,25 +7,24 @@
filein = -I/usr/include/ -I${MKLROOT}/include filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \ ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
-L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \ LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
-lpthread -lm -ldl
## Aggressive optimization flags: ## Aggressive optimization flags:
## -O3: Maximum optimization ## -O3: Maximum optimization
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible) ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
## -fp-model fast=2: Aggressive floating-point optimizations ## -fp-model fast=2: Aggressive floating-point optimizations
## -fma: Enable fused multiply-add instructions ## -fma: Enable fused multiply-add instructions
## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues ## Note: OpenMP enabled for hybrid MPI+OpenMP
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \ CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
-Dfortran3 -Dnewc -I${MKLROOT}/include -Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma \ f90appflags = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
-fpp -I${MKLROOT}/include -fpp -I${MKLROOT}/include
f90 = ifx f90 = ifx
f77 = ifx f77 = ifx
CXX = icpx CXX = icpx
CC = icx CC = icx
CLINKER = mpiicpx CLINKER = mpiicpx -qopenmp
Cu = nvcc Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include

View File

@@ -11,16 +11,14 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import subprocess import subprocess
## CPU core binding configuration using taskset ## CPU core binding configuration
## taskset ensures all child processes inherit the CPU affinity mask ## Removed hardcoded taskset to allow full utilization of 96 cores via MPI+OpenMP
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111) NUMACTL_CPU_BIND = ""
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
## Build parallelism configuration ## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds ## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 104 BUILD_JOBS = 96
################################################################## ##################################################################
@@ -37,7 +35,7 @@ def makefile_ABE():
print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " )
print( ) print( )
## Build command with CPU binding to nohz_full cores ## Build command
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE" makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
@@ -78,7 +76,7 @@ def makefile_TwoPunctureABE():
print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " ) print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
print( ) print( )
## Build command with CPU binding to nohz_full cores ## Build command
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE" makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
## Execute the command with subprocess.Popen and stream output ## Execute the command with subprocess.Popen and stream output
@@ -113,13 +111,23 @@ def run_ABE():
print( " Running the AMSS-NCKU executable file ABE/ABEGPU " ) print( " Running the AMSS-NCKU executable file ABE/ABEGPU " )
print( ) print( )
## Calculate OMP_NUM_THREADS
## User has 96 cores. Calculate threads per MPI process.
total_physical_cores = 96
omp_num_threads = total_physical_cores // input_data.MPI_processes
if omp_num_threads < 1:
omp_num_threads = 1
print( f" Configuration: {input_data.MPI_processes} MPI processes, {omp_num_threads} OpenMP threads per process." )
print( f" Total cores utilized: {input_data.MPI_processes * omp_num_threads}" )
## Define the command to run; cast other values to strings as needed ## Define the command to run; cast other values to strings as needed
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = f"{NUMACTL_CPU_BIND} mpirun -genv OMP_NUM_THREADS {omp_num_threads} -np {input_data.MPI_processes} ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command = f"{NUMACTL_CPU_BIND} mpirun -genv OMP_NUM_THREADS {omp_num_threads} -np {input_data.MPI_processes} ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output ## Execute the MPI command and stream output