From 082f9c34236577fd3282a54864771b34c866f617 Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Fri, 6 Feb 2026 13:25:07 +0800
Subject: [PATCH] feat: Implement hybrid MPI+OpenMP parallelization

- Enable -qopenmp in makefile.inc
- Add OpenMP directives to 4th order derivatives in diff_new.f90
- Update makefile_and_run.py to dynamically calculate OMP_NUM_THREADS
  based on 96 cores and remove hardcoded CPU binding
---
 AMSS_NCKU_source/diff_new.f90 | 45 +++++++++++++++++++++++++++++++++++
 AMSS_NCKU_source/makefile.inc |  8 +++----
 makefile_and_run.py           | 28 ++++++++++++++--------
 3 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/AMSS_NCKU_source/diff_new.f90 b/AMSS_NCKU_source/diff_new.f90
index 93954f1..9e4c7e5 100644
--- a/AMSS_NCKU_source/diff_new.f90
+++ b/AMSS_NCKU_source/diff_new.f90
@@ -69,6 +69,7 @@
   fy = ZEO
   fz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -151,6 +152,7 @@
 
   fx = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -218,6 +220,7 @@
 
   fy = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -282,6 +285,7 @@
 
   fz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -371,6 +375,7 @@
   fxz = ZEO
   fyz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -469,6 +474,7 @@
 
   fxx = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -531,6 +537,7 @@
 
   fyy = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -594,6 +601,7 @@
 
   fzz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -657,6 +665,7 @@
 
   fxy = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -719,6 +728,7 @@
 
   fxz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -780,6 +790,7 @@
 
   fyz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -866,6 +877,7 @@
   fxz = ZEO
   fyz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -997,6 +1009,7 @@
   fy = ZEO
   fz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -1151,6 +1164,7 @@
 
   fx = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -1227,6 +1241,7 @@
 
   fy = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -1297,6 +1312,7 @@
 
   fz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -1401,6 +1417,7 @@
   fxz = ZEO
   fyz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -1576,6 +1593,7 @@
 
   fxx = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -1643,6 +1661,7 @@
 
   fyy = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -1712,6 +1731,7 @@
 
   fzz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -1781,6 +1801,7 @@
 
   fxy = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -1851,6 +1872,7 @@
 
   fxz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -1919,6 +1941,7 @@
 
   fyz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -2011,6 +2034,7 @@
   fy = ZEO
   fz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -2127,6 +2151,7 @@
 
   fx = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -2212,6 +2237,7 @@
 
   fy = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -2288,6 +2314,7 @@
 
   fz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -2406,6 +2433,7 @@
   fxz = ZEO
   fyz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -2593,6 +2621,7 @@
 
   fxx = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -2665,6 +2694,7 @@
 
   fyy = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -2740,6 +2770,7 @@
 
   fzz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -2815,6 +2846,7 @@
 
   fxy = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -2895,6 +2927,7 @@
 
   fxz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -2973,6 +3006,7 @@
 
   fyz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -3080,6 +3114,7 @@
   fy = ZEO
   fz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -3216,6 +3251,7 @@
 
   fx = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -3311,6 +3347,7 @@
 
   fy = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -3395,6 +3432,7 @@
 
   fz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -3530,6 +3568,7 @@
   fxz = ZEO
   fyz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -3802,6 +3841,7 @@
 
   fxx = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -3883,6 +3923,7 @@
 
   fyy = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -3967,6 +4008,7 @@
 
   fzz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -4051,6 +4093,7 @@
 
   fxy = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -4153,6 +4196,7 @@
 
   fxz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
@@ -4253,6 +4297,7 @@
 
   fyz = ZEO
 
+  !$omp parallel do collapse(3) schedule(static)
   do k=1,ex(3)-1
   do j=1,ex(2)-1
   do i=1,ex(1)-1
diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc
index 8068ef3..3fddbee 100755
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -15,16 +15,16 @@ LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore
 ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
 ## -fp-model fast=2: Aggressive floating-point optimizations
 ## -fma: Enable fused multiply-add instructions
-## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
-CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \
+## Note: OpenMP enabled for hybrid MPI+OpenMP
+CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
               -Dfortran3 -Dnewc -I${MKLROOT}/include
-f90appflags = -O3 -xHost -fp-model fast=2 -fma \
+f90appflags = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
               -fpp -I${MKLROOT}/include
 f90 = ifx
 f77 = ifx
 CXX = icpx
 CC = icx
-CLINKER = mpiicpx
+CLINKER = mpiicpx -qopenmp
 Cu = nvcc
 
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
diff --git a/makefile_and_run.py b/makefile_and_run.py
index 6140f99..4d9dde5 100755
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -11,16 +11,14 @@
 import AMSS_NCKU_Input as input_data
 import subprocess
 
-## CPU core binding configuration using taskset
-## taskset ensures all child processes inherit the CPU affinity mask
-## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
-## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
-NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
+## CPU core binding configuration
+## Removed hardcoded taskset to allow full utilization of 96 cores via MPI+OpenMP
+NUMACTL_CPU_BIND = ""
 
 ## Build parallelism configuration
 ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
 ## Set make -j to utilize available cores for faster builds
-BUILD_JOBS = 104
+BUILD_JOBS = 96
 
 
 ##################################################################
@@ -37,7 +35,7 @@ def makefile_ABE():
     print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " )
     print( )
 
-    ## Build command with CPU binding to nohz_full cores
+    ## Build command
     if (input_data.GPU_Calculation == "no"):
         makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
     elif (input_data.GPU_Calculation == "yes"):
@@ -78,7 +76,7 @@ def makefile_TwoPunctureABE():
     print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
     print( )
 
-    ## Build command with CPU binding to nohz_full cores
+    ## Build command
     makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
 
     ## Execute the command with subprocess.Popen and stream output
@@ -113,13 +111,23 @@ def run_ABE():
     print( " Running the AMSS-NCKU executable file ABE/ABEGPU " )
     print( )
 
+    ## Calculate OMP_NUM_THREADS
+    ## User has 96 cores. Calculate threads per MPI process.
+    total_physical_cores = 96
+    omp_num_threads = total_physical_cores // input_data.MPI_processes
+    if omp_num_threads < 1:
+        omp_num_threads = 1
+
+    print( f" Configuration: {input_data.MPI_processes} MPI processes, {omp_num_threads} OpenMP threads per process." )
+    print( f" Total cores utilized: {input_data.MPI_processes * omp_num_threads}" )
+
     ## Define the command to run; cast other values to strings as needed
     if (input_data.GPU_Calculation == "no"):
-        mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        mpi_command = f"{NUMACTL_CPU_BIND} mpirun -genv OMP_NUM_THREADS {omp_num_threads} -np {input_data.MPI_processes} ./ABE"
         mpi_command_outfile = "ABE_out.log"
 
     elif (input_data.GPU_Calculation == "yes"):
-        mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command = f"{NUMACTL_CPU_BIND} mpirun -genv OMP_NUM_THREADS {omp_num_threads} -np {input_data.MPI_processes} ./ABEGPU"
         mpi_command_outfile = "ABEGPU_out.log"
 
     ## Execute the MPI command and stream output