Compare commits
1 Commits
cjy-dystop
...
cjy-oneapi
| Author | SHA1 | Date | |
|---|---|---|---|
| 4eb698f496 |
@@ -16,7 +16,7 @@ import numpy
|
||||
File_directory = "GW150914" ## output file directory
|
||||
Output_directory = "binary_output" ## binary data file directory
|
||||
## The file directory name should not be too long
|
||||
MPI_processes = 64 ## number of mpi processes used in the simulation
|
||||
MPI_processes = 48 ## number of mpi processes used in the simulation
|
||||
|
||||
GPU_Calculation = "no" ## Use GPU or not
|
||||
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
|
||||
|
||||
@@ -997,10 +997,11 @@
|
||||
fy = ZEO
|
||||
fz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
#if 0
|
||||
#if 0
|
||||
! x direction
|
||||
if(i+2 <= imax .and. i-2 >= imin)then
|
||||
!
|
||||
@@ -1151,10 +1152,11 @@
|
||||
|
||||
fx = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
! x direction
|
||||
! x direction
|
||||
if(i+2 <= imax .and. i-2 >= imin)then
|
||||
!
|
||||
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
|
||||
@@ -1227,10 +1229,11 @@
|
||||
|
||||
fy = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
! y direction
|
||||
! y direction
|
||||
if(j+2 <= jmax .and. j-2 >= jmin)then
|
||||
|
||||
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
|
||||
@@ -1297,10 +1300,11 @@
|
||||
|
||||
fz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
! z direction
|
||||
! z direction
|
||||
if(k+2 <= kmax .and. k-2 >= kmin)then
|
||||
|
||||
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
|
||||
@@ -1401,10 +1405,11 @@
|
||||
fxz = ZEO
|
||||
fyz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
#if 0
|
||||
#if 0
|
||||
!~~~~~~ fxx
|
||||
if(i+2 <= imax .and. i-2 >= imin)then
|
||||
!
|
||||
@@ -1576,6 +1581,7 @@
|
||||
|
||||
fxx = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
@@ -1643,6 +1649,7 @@
|
||||
|
||||
fyy = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
@@ -1712,6 +1719,7 @@
|
||||
|
||||
fzz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
@@ -1781,6 +1789,7 @@
|
||||
|
||||
fxy = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
@@ -1851,6 +1860,7 @@
|
||||
|
||||
fxz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
@@ -1919,6 +1929,7 @@
|
||||
|
||||
fyz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
|
||||
@@ -1019,10 +1019,11 @@
|
||||
fy = ZEO
|
||||
fz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
! x direction
|
||||
! x direction
|
||||
if(i+2 <= imax .and. i-2 >= imin)then
|
||||
!
|
||||
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
|
||||
@@ -1134,10 +1135,11 @@
|
||||
|
||||
fx = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
! x direction
|
||||
! x direction
|
||||
if(i+2 <= imax .and. i-2 >= imin)then
|
||||
!
|
||||
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
|
||||
@@ -1227,10 +1229,11 @@
|
||||
|
||||
fy = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
! y direction
|
||||
! y direction
|
||||
if(j+2 <= jmax .and. j-2 >= jmin)then
|
||||
|
||||
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
|
||||
@@ -1314,10 +1317,11 @@
|
||||
|
||||
fz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
! z direction
|
||||
! z direction
|
||||
if(k+2 <= kmax .and. k-2 >= kmin)then
|
||||
|
||||
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
|
||||
@@ -1430,6 +1434,7 @@
|
||||
fxz = ZEO
|
||||
fyz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
@@ -1580,6 +1585,7 @@
|
||||
|
||||
fxx = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
@@ -1659,6 +1665,7 @@
|
||||
|
||||
fyy = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
@@ -1740,6 +1747,7 @@
|
||||
|
||||
fzz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
@@ -1821,6 +1829,7 @@
|
||||
|
||||
fxy = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
@@ -1903,6 +1912,7 @@
|
||||
|
||||
fxz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
@@ -1983,6 +1993,7 @@
|
||||
|
||||
fyz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
|
||||
@@ -1186,10 +1186,11 @@
|
||||
fy = ZEO
|
||||
fz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
! x direction
|
||||
! x direction
|
||||
if(i+2 <= imax .and. i-2 >= imin)then
|
||||
!
|
||||
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
|
||||
@@ -1300,10 +1301,11 @@
|
||||
|
||||
fx = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
! x direction
|
||||
! x direction
|
||||
if(i+2 <= imax .and. i-2 >= imin)then
|
||||
!
|
||||
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
|
||||
@@ -1381,10 +1383,11 @@
|
||||
|
||||
fy = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
! y direction
|
||||
! y direction
|
||||
if(j+2 <= jmax .and. j-2 >= jmin)then
|
||||
|
||||
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
|
||||
@@ -1456,10 +1459,11 @@
|
||||
|
||||
fz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
! z direction
|
||||
! z direction
|
||||
if(k+2 <= kmax .and. k-2 >= kmin)then
|
||||
|
||||
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
|
||||
@@ -1565,6 +1569,7 @@
|
||||
fxz = ZEO
|
||||
fyz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
@@ -1781,6 +1786,7 @@
|
||||
|
||||
fxx = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
@@ -1856,6 +1862,7 @@
|
||||
|
||||
fyy = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
@@ -1933,6 +1940,7 @@
|
||||
|
||||
fzz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
@@ -2010,6 +2018,7 @@
|
||||
|
||||
fxy = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
@@ -2098,6 +2107,7 @@
|
||||
|
||||
fxz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
@@ -2184,6 +2194,7 @@
|
||||
|
||||
fyz = ZEO
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
|
||||
@@ -159,6 +159,7 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
|
||||
|
||||
call symmetry_bd(3,ex,f,fh,SoA)
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
|
||||
@@ -369,11 +369,12 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
|
||||
|
||||
call symmetry_stbd(3,ex,f,fh,SoA)
|
||||
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
|
||||
#if 1
|
||||
#if 1
|
||||
if(i-3 >= imin .and. i+3 <= imax .and. &
|
||||
j-3 >= jmin .and. j+3 <= jmax .and. &
|
||||
k-3 >= kmin .and. k+3 <= kmax) then
|
||||
|
||||
@@ -231,8 +231,9 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
|
||||
|
||||
call symmetry_bd(3,ex,f,fh,SoA)
|
||||
|
||||
! upper bound set ex-1 only for efficiency,
|
||||
! upper bound set ex-1 only for efficiency,
|
||||
! the loop body will set ex 0 also
|
||||
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
|
||||
@@ -8,17 +8,17 @@ filein = -I/usr/include/ -I${MKLROOT}/include
|
||||
|
||||
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
||||
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
||||
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
|
||||
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -qopenmp
|
||||
|
||||
## Aggressive optimization flags:
|
||||
## -O3: Maximum optimization
|
||||
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
|
||||
## -fp-model fast=2: Aggressive floating-point optimizations
|
||||
## -fma: Enable fused multiply-add instructions
|
||||
## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
|
||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \
|
||||
## OpenMP re-enabled for MPI+OpenMP hybrid parallelism (MKL stays sequential to avoid nested parallelism)
|
||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma \
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
|
||||
-fpp -I${MKLROOT}/include
|
||||
f90 = ifx
|
||||
f77 = ifx
|
||||
|
||||
@@ -13,13 +13,9 @@ import subprocess
|
||||
|
||||
## CPU core binding configuration using taskset
|
||||
## taskset ensures all child processes inherit the CPU affinity mask
|
||||
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
|
||||
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
|
||||
NUMACTL_CPU_BIND = "taskset -c 0-111"
|
||||
|
||||
## Build parallelism configuration
|
||||
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
|
||||
## Set make -j to utilize available cores for faster builds
|
||||
BUILD_JOBS = 104
|
||||
|
||||
|
||||
@@ -114,12 +110,18 @@ def run_ABE():
|
||||
print( )
|
||||
|
||||
## Define the command to run; cast other values to strings as needed
|
||||
|
||||
## MPI+OpenMP hybrid: compute threads per rank from total cores / MPI ranks
|
||||
omp_threads = max(1, 96 // input_data.MPI_processes)
|
||||
omp_env = (f" -genv OMP_NUM_THREADS={omp_threads}"
|
||||
f" -genv OMP_PROC_BIND=close"
|
||||
f" -genv OMP_PLACES=cores"
|
||||
f" -genv I_MPI_PIN_DOMAIN=omp")
|
||||
|
||||
if (input_data.GPU_Calculation == "no"):
|
||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
|
||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + omp_env + " ./ABE"
|
||||
mpi_command_outfile = "ABE_out.log"
|
||||
elif (input_data.GPU_Calculation == "yes"):
|
||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
|
||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + omp_env + " ./ABEGPU"
|
||||
mpi_command_outfile = "ABEGPU_out.log"
|
||||
|
||||
## Execute the MPI command and stream output
|
||||
|
||||
Reference in New Issue
Block a user