Compare commits

..

1 Commits

Author SHA1 Message Date
082f9c3423 feat: Implement hybrid MPI+OpenMP parallelization
- Enable -qopenmp in makefile.inc
- Add OpenMP directives to 4th order derivatives in diff_new.f90
- Update makefile_and_run.py to dynamic calculate OMP_NUM_THREADS based on 96 cores and remove hardcoded CPU binding
2026-02-06 13:25:07 +08:00
8 changed files with 84 additions and 69 deletions

View File

@@ -69,6 +69,7 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -151,6 +152,7 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -218,6 +220,7 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -282,6 +285,7 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -371,6 +375,7 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -469,6 +474,7 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -531,6 +537,7 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -594,6 +601,7 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -657,6 +665,7 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -719,6 +728,7 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -780,6 +790,7 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -866,6 +877,7 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -997,11 +1009,11 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) !$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
#if 0 #if 0
! x direction ! x direction
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !
@@ -1152,11 +1164,11 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) !$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
! x direction ! x direction
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
@@ -1229,11 +1241,11 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) !$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
! y direction ! y direction
if(j+2 <= jmax .and. j-2 >= jmin)then if(j+2 <= jmax .and. j-2 >= jmin)then
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k)) fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
@@ -1300,11 +1312,11 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) !$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
! z direction ! z direction
if(k+2 <= kmax .and. k-2 >= kmin)then if(k+2 <= kmax .and. k-2 >= kmin)then
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2)) fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
@@ -1405,11 +1417,11 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) !$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
#if 0 #if 0
!~~~~~~ fxx !~~~~~~ fxx
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !
@@ -1581,7 +1593,7 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) !$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1649,7 +1661,7 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) !$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1719,7 +1731,7 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) !$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1789,7 +1801,7 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) !$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1860,7 +1872,7 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) !$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1929,7 +1941,7 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096) !$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2022,6 +2034,7 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2138,6 +2151,7 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2223,6 +2237,7 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2299,6 +2314,7 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2417,6 +2433,7 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2604,6 +2621,7 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2676,6 +2694,7 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2751,6 +2770,7 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2826,6 +2846,7 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2906,6 +2927,7 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -2984,6 +3006,7 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3091,6 +3114,7 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3227,6 +3251,7 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3322,6 +3347,7 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3406,6 +3432,7 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3541,6 +3568,7 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3813,6 +3841,7 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3894,6 +3923,7 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -3978,6 +4008,7 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -4062,6 +4093,7 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -4164,6 +4196,7 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -4264,6 +4297,7 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1

View File

@@ -1019,11 +1019,10 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
! x direction ! x direction
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
@@ -1135,11 +1134,10 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
! x direction ! x direction
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
@@ -1229,11 +1227,10 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
! y direction ! y direction
if(j+2 <= jmax .and. j-2 >= jmin)then if(j+2 <= jmax .and. j-2 >= jmin)then
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k)) fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
@@ -1317,11 +1314,10 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
! z direction ! z direction
if(k+2 <= kmax .and. k-2 >= kmin)then if(k+2 <= kmax .and. k-2 >= kmin)then
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2)) fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
@@ -1434,7 +1430,6 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1585,7 +1580,6 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1665,7 +1659,6 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1747,7 +1740,6 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1829,7 +1821,6 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1912,7 +1903,6 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1993,7 +1983,6 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)

View File

@@ -1186,11 +1186,10 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
! x direction ! x direction
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
@@ -1301,11 +1300,10 @@
fx = ZEO fx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
! x direction ! x direction
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
@@ -1383,11 +1381,10 @@
fy = ZEO fy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
! y direction ! y direction
if(j+2 <= jmax .and. j-2 >= jmin)then if(j+2 <= jmax .and. j-2 >= jmin)then
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k)) fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
@@ -1459,11 +1456,10 @@
fz = ZEO fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
! z direction ! z direction
if(k+2 <= kmax .and. k-2 >= kmin)then if(k+2 <= kmax .and. k-2 >= kmin)then
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2)) fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
@@ -1569,7 +1565,6 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1786,7 +1781,6 @@
fxx = ZEO fxx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1862,7 +1856,6 @@
fyy = ZEO fyy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -1940,7 +1933,6 @@
fzz = ZEO fzz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -2018,7 +2010,6 @@
fxy = ZEO fxy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -2107,7 +2098,6 @@
fxz = ZEO fxz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -2194,7 +2184,6 @@
fyz = ZEO fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)

View File

@@ -159,7 +159,6 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
call symmetry_bd(3,ex,f,fh,SoA) call symmetry_bd(3,ex,f,fh,SoA)
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)

View File

@@ -369,12 +369,11 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
call symmetry_stbd(3,ex,f,fh,SoA) call symmetry_stbd(3,ex,f,fh,SoA)
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
#if 1 #if 1
if(i-3 >= imin .and. i+3 <= imax .and. & if(i-3 >= imin .and. i+3 <= imax .and. &
j-3 >= jmin .and. j+3 <= jmax .and. & j-3 >= jmin .and. j+3 <= jmax .and. &
k-3 >= kmin .and. k+3 <= kmax) then k-3 >= kmin .and. k+3 <= kmax) then

View File

@@ -231,9 +231,8 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
call symmetry_bd(3,ex,f,fh,SoA) call symmetry_bd(3,ex,f,fh,SoA)
! upper bound set ex-1 only for efficiency, ! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also ! the loop body will set ex 0 also
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1

View File

@@ -8,14 +8,14 @@ filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -qopenmp LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
## Aggressive optimization flags: ## Aggressive optimization flags:
## -O3: Maximum optimization ## -O3: Maximum optimization
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible) ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
## -fp-model fast=2: Aggressive floating-point optimizations ## -fp-model fast=2: Aggressive floating-point optimizations
## -fma: Enable fused multiply-add instructions ## -fma: Enable fused multiply-add instructions
## OpenMP re-enabled for MPI+OpenMP hybrid parallelism (MKL stays sequential to avoid nested parallelism) ## Note: OpenMP enabled for hybrid MPI+OpenMP
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -qopenmp \ CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
-Dfortran3 -Dnewc -I${MKLROOT}/include -Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -qopenmp \ f90appflags = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
@@ -24,7 +24,7 @@ f90 = ifx
f77 = ifx f77 = ifx
CXX = icpx CXX = icpx
CC = icx CC = icx
CLINKER = mpiicpx CLINKER = mpiicpx -qopenmp
Cu = nvcc Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include

View File

@@ -11,12 +11,14 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import subprocess import subprocess
## CPU core binding configuration using taskset ## CPU core binding configuration
## taskset ensures all child processes inherit the CPU affinity mask ## Removed hardcoded taskset to allow full utilization of 96 cores via MPI+OpenMP
NUMACTL_CPU_BIND = "taskset -c 0-111" NUMACTL_CPU_BIND = ""
## Build parallelism configuration ## Build parallelism configuration
BUILD_JOBS = 104 ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 96
################################################################## ##################################################################
@@ -33,7 +35,7 @@ def makefile_ABE():
print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " )
print( ) print( )
## Build command with CPU binding to nohz_full cores ## Build command
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE" makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
@@ -74,7 +76,7 @@ def makefile_TwoPunctureABE():
print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " ) print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
print( ) print( )
## Build command with CPU binding to nohz_full cores ## Build command
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE" makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
## Execute the command with subprocess.Popen and stream output ## Execute the command with subprocess.Popen and stream output
@@ -109,19 +111,23 @@ def run_ABE():
print( " Running the AMSS-NCKU executable file ABE/ABEGPU " ) print( " Running the AMSS-NCKU executable file ABE/ABEGPU " )
print( ) print( )
## Define the command to run; cast other values to strings as needed ## Calculate OMP_NUM_THREADS
## MPI+OpenMP hybrid: compute threads per rank from total cores / MPI ranks ## User has 96 cores. Calculate threads per MPI process.
omp_threads = max(1, 96 // input_data.MPI_processes) total_physical_cores = 96
omp_env = (f" -genv OMP_NUM_THREADS={omp_threads}" omp_num_threads = total_physical_cores // input_data.MPI_processes
f" -genv OMP_PROC_BIND=close" if omp_num_threads < 1:
f" -genv OMP_PLACES=cores" omp_num_threads = 1
f" -genv I_MPI_PIN_DOMAIN=omp")
print( f" Configuration: {input_data.MPI_processes} MPI processes, {omp_num_threads} OpenMP threads per process." )
print( f" Total cores utilized: {input_data.MPI_processes * omp_num_threads}" )
## Define the command to run; cast other values to strings as needed
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + omp_env + " ./ABE" mpi_command = f"{NUMACTL_CPU_BIND} mpirun -genv OMP_NUM_THREADS {omp_num_threads} -np {input_data.MPI_processes} ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + omp_env + " ./ABEGPU" mpi_command = f"{NUMACTL_CPU_BIND} mpirun -genv OMP_NUM_THREADS {omp_num_threads} -np {input_data.MPI_processes} ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output ## Execute the MPI command and stream output