Compare commits

..

3 Commits

Author SHA1 Message Date
4eb698f496 Add MPI+OpenMP hybrid parallelism (48 ranks x 2 threads) for full 96-core utilization
Enable OpenMP threading in finite-difference kernels (diff_new, diff_new_sh, diff_newwb,
lopsidediff, kodiss, kodiss_sh) with collapse(3) directives on 36 triple-nested loops.
Update build flags (-qopenmp), MPI process binding, and runtime configuration.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 15:53:15 +08:00
223ec17a54 input updated 2026-02-06 13:57:48 +08:00
26c81d8e81 makefile updated 2026-01-19 23:53:16 +08:00
12 changed files with 1253 additions and 2460 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -997,6 +997,7 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1151,6 +1152,7 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1227,6 +1229,7 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1297,6 +1300,7 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1401,6 +1405,7 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1576,6 +1581,7 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1643,6 +1649,7 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1712,6 +1719,7 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1781,6 +1789,7 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1851,6 +1860,7 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1919,6 +1929,7 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1

View File

@@ -1019,6 +1019,7 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1134,6 +1135,7 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1227,6 +1229,7 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1314,6 +1317,7 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1430,6 +1434,7 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1580,6 +1585,7 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1659,6 +1665,7 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1740,6 +1747,7 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1821,6 +1829,7 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1903,6 +1912,7 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1983,6 +1993,7 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)

View File

@@ -1186,6 +1186,7 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1300,6 +1301,7 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1381,6 +1383,7 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1456,6 +1459,7 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1565,6 +1569,7 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1781,6 +1786,7 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1856,6 +1862,7 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1933,6 +1940,7 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -2010,6 +2018,7 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -2098,6 +2107,7 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -2184,6 +2194,7 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)

View File

@@ -159,6 +159,7 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
call symmetry_bd(3,ex,f,fh,SoA) call symmetry_bd(3,ex,f,fh,SoA)
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)

View File

@@ -369,6 +369,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
call symmetry_stbd(3,ex,f,fh,SoA) call symmetry_stbd(3,ex,f,fh,SoA)
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)

View File

@@ -233,6 +233,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
! upper bound set ex-1 only for efficiency, ! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also ! the loop body will set ex 0 also
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1

View File

@@ -34,7 +34,7 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
prolongrestrict_cell.o prolongrestrict_vertex.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\
rungekutta4_rout.o bssn_rhs_opt.o bssn_rhs.o bssn_rhs_legacy.o diff_new.o kodiss.o kodiss_sh.o\ rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\ lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\ shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\ getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\

View File

@@ -7,19 +7,18 @@
filein = -I/usr/include/ -I${MKLROOT}/include filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \ ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
-L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \ LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -qopenmp
-lpthread -lm -ldl
## Aggressive optimization flags: ## Aggressive optimization flags:
## -O3: Maximum optimization ## -O3: Maximum optimization
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible) ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
## -fp-model fast=2: Aggressive floating-point optimizations ## -fp-model fast=2: Aggressive floating-point optimizations
## -fma: Enable fused multiply-add instructions ## -fma: Enable fused multiply-add instructions
## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues ## OpenMP re-enabled for MPI+OpenMP hybrid parallelism (MKL stays sequential to avoid nested parallelism)
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \ CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
-Dfortran3 -Dnewc -I${MKLROOT}/include -Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma \ f90appflags = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
-fpp -I${MKLROOT}/include -fpp -I${MKLROOT}/include
f90 = ifx f90 = ifx
f77 = ifx f77 = ifx

View File

@@ -13,13 +13,9 @@ import subprocess
## CPU core binding configuration using taskset ## CPU core binding configuration using taskset
## taskset ensures all child processes inherit the CPU affinity mask ## taskset ensures all child processes inherit the CPU affinity mask
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111) NUMACTL_CPU_BIND = "taskset -c 0-111"
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
## Build parallelism configuration ## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 104 BUILD_JOBS = 104
@@ -114,12 +110,18 @@ def run_ABE():
print( ) print( )
## Define the command to run; cast other values to strings as needed ## Define the command to run; cast other values to strings as needed
## MPI+OpenMP hybrid: compute threads per rank from total cores / MPI ranks
omp_threads = max(1, 96 // input_data.MPI_processes)
omp_env = (f" -genv OMP_NUM_THREADS={omp_threads}"
f" -genv OMP_PROC_BIND=close"
f" -genv OMP_PLACES=cores"
f" -genv I_MPI_PIN_DOMAIN=omp")
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + omp_env + " ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + omp_env + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output ## Execute the MPI command and stream output