diff --git a/AMSS_NCKU_Input.py b/AMSS_NCKU_Input.py index fe25a50..f288e2a 100755 --- a/AMSS_NCKU_Input.py +++ b/AMSS_NCKU_Input.py @@ -16,7 +16,7 @@ import numpy File_directory = "GW150914" ## output file directory Output_directory = "binary_output" ## binary data file directory ## The file directory name should not be too long -MPI_processes = 64 ## number of mpi processes used in the simulation +MPI_processes = 48 ## number of mpi processes used in the simulation GPU_Calculation = "no" ## Use GPU or not ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface) diff --git a/AMSS_NCKU_source/diff_new.f90 b/AMSS_NCKU_source/diff_new.f90 index 93954f1..b0d9cec 100644 --- a/AMSS_NCKU_source/diff_new.f90 +++ b/AMSS_NCKU_source/diff_new.f90 @@ -997,10 +997,11 @@ fy = ZEO fz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 -#if 0 +#if 0 ! x direction if(i+2 <= imax .and. i-2 >= imin)then ! @@ -1151,10 +1152,11 @@ fx = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 -! x direction +! x direction if(i+2 <= imax .and. i-2 >= imin)then ! ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) @@ -1227,10 +1229,11 @@ fy = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 -! y direction +! y direction if(j+2 <= jmax .and. j-2 >= jmin)then fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k)) @@ -1297,10 +1300,11 @@ fz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 -! z direction +! z direction if(k+2 <= kmax .and. k-2 >= kmin)then fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2)) @@ -1401,10 +1405,11 @@ fxz = ZEO fyz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 -#if 0 +#if 0 !~~~~~~ fxx if(i+2 <= imax .and. i-2 >= imin)then ! @@ -1576,6 +1581,7 @@ fxx = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1643,6 +1649,7 @@ fyy = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1712,6 +1719,7 @@ fzz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1781,6 +1789,7 @@ fxy = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1851,6 +1860,7 @@ fxz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1919,6 +1929,7 @@ fyz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 diff --git a/AMSS_NCKU_source/diff_new_sh.f90 b/AMSS_NCKU_source/diff_new_sh.f90 index 91d21d7..7d049e1 100644 --- a/AMSS_NCKU_source/diff_new_sh.f90 +++ b/AMSS_NCKU_source/diff_new_sh.f90 @@ -1019,10 +1019,11 @@ fy = ZEO fz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) -! x direction +! x direction if(i+2 <= imax .and. i-2 >= imin)then ! ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) @@ -1134,10 +1135,11 @@ fx = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) -! x direction +! x direction if(i+2 <= imax .and. i-2 >= imin)then ! ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) @@ -1227,10 +1229,11 @@ fy = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) -! y direction +! y direction if(j+2 <= jmax .and. j-2 >= jmin)then fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k)) @@ -1314,10 +1317,11 @@ fz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) -! z direction +! z direction if(k+2 <= kmax .and. k-2 >= kmin)then fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2)) @@ -1430,6 +1434,7 @@ fxz = ZEO fyz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1580,6 +1585,7 @@ fxx = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1659,6 +1665,7 @@ fyy = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1740,6 +1747,7 @@ fzz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1821,6 +1829,7 @@ fxy = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1903,6 +1912,7 @@ fxz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1983,6 +1993,7 @@ fyz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) diff --git a/AMSS_NCKU_source/diff_newwb.f90 b/AMSS_NCKU_source/diff_newwb.f90 index e6ee09d..3cd8790 100644 --- a/AMSS_NCKU_source/diff_newwb.f90 +++ b/AMSS_NCKU_source/diff_newwb.f90 @@ -1186,10 +1186,11 @@ fy = ZEO fz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) -! x direction +! x direction if(i+2 <= imax .and. i-2 >= imin)then ! ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) @@ -1300,10 +1301,11 @@ fx = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) -! x direction +! x direction if(i+2 <= imax .and. i-2 >= imin)then ! ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) @@ -1381,10 +1383,11 @@ fy = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) -! y direction +! y direction if(j+2 <= jmax .and. j-2 >= jmin)then fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k)) @@ -1456,10 +1459,11 @@ fz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) -! z direction +! z direction if(k+2 <= kmax .and. k-2 >= kmin)then fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2)) @@ -1565,6 +1569,7 @@ fxz = ZEO fyz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1781,6 +1786,7 @@ fxx = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1856,6 +1862,7 @@ fyy = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1933,6 +1940,7 @@ fzz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -2010,6 +2018,7 @@ fxy = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -2098,6 +2107,7 @@ fxz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -2184,6 +2194,7 @@ fyz = ZEO + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) diff --git a/AMSS_NCKU_source/kodiss.f90 b/AMSS_NCKU_source/kodiss.f90 index a12ada4..868bb67 100644 --- a/AMSS_NCKU_source/kodiss.f90 +++ b/AMSS_NCKU_source/kodiss.f90 @@ -159,6 +159,7 @@ integer, parameter :: NO_SYMM=0, OCTANT=2 call symmetry_bd(3,ex,f,fh,SoA) + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) diff --git a/AMSS_NCKU_source/kodiss_sh.f90 b/AMSS_NCKU_source/kodiss_sh.f90 index c166995..8f62358 100644 --- a/AMSS_NCKU_source/kodiss_sh.f90 +++ b/AMSS_NCKU_source/kodiss_sh.f90 @@ -369,11 +369,12 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2 call symmetry_stbd(3,ex,f,fh,SoA) + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) -#if 1 +#if 1 if(i-3 >= imin .and. i+3 <= imax .and. & j-3 >= jmin .and. j+3 <= jmax .and. & k-3 >= kmin .and. k+3 <= kmax) then diff --git a/AMSS_NCKU_source/lopsidediff.f90 b/AMSS_NCKU_source/lopsidediff.f90 index 2e97af5..471d089 100644 --- a/AMSS_NCKU_source/lopsidediff.f90 +++ b/AMSS_NCKU_source/lopsidediff.f90 @@ -231,8 +231,9 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA) call symmetry_bd(3,ex,f,fh,SoA) -! upper bound set ex-1 only for efficiency, +! upper bound set ex-1 only for efficiency, ! the loop body will set ex 0 also + !$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 8068ef3..f2e909a 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -8,17 +8,17 @@ filein = -I/usr/include/ -I${MKLROOT}/include ## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library -LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl +LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -qopenmp ## Aggressive optimization flags: ## -O3: Maximum optimization ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible) ## -fp-model fast=2: Aggressive floating-point optimizations ## -fma: Enable fused multiply-add instructions -## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues -CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \ +## OpenMP re-enabled for MPI+OpenMP hybrid parallelism (MKL stays sequential to avoid nested parallelism) +CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -qopenmp \ -Dfortran3 -Dnewc -I${MKLROOT}/include -f90appflags = -O3 -xHost -fp-model fast=2 -fma \ +f90appflags = -O3 -xHost -fp-model fast=2 -fma -qopenmp \ -fpp -I${MKLROOT}/include f90 = ifx f77 = ifx diff --git a/makefile_and_run.py b/makefile_and_run.py index 72ded5b..fc672a4 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -13,13 +13,9 @@ import subprocess ## CPU core binding configuration using taskset ## taskset ensures all child processes inherit the CPU affinity mask -## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111) -## Format: taskset -c 4-55,60-111 ensures processes only run on these cores NUMACTL_CPU_BIND = "taskset -c 0-111" ## Build parallelism configuration -## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores -## Set make -j to utilize available cores for faster builds BUILD_JOBS = 104 @@ -114,12 +110,18 @@ def run_ABE(): print( ) ## Define the command to run; cast other values to strings as needed - + ## MPI+OpenMP hybrid: compute threads per rank from total cores / MPI ranks + omp_threads = max(1, 96 // input_data.MPI_processes) + omp_env = (f" -genv OMP_NUM_THREADS={omp_threads}" + f" -genv OMP_PROC_BIND=close" + f" -genv OMP_PLACES=cores" + f" -genv I_MPI_PIN_DOMAIN=omp") + if (input_data.GPU_Calculation == "no"): - mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" + mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + omp_env + " ./ABE" mpi_command_outfile = "ABE_out.log" elif (input_data.GPU_Calculation == "yes"): - mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" + mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + omp_env + " ./ABEGPU" mpi_command_outfile = "ABEGPU_out.log" ## Execute the MPI command and stream output