diff --git a/AMSS_NCKU_source/diff_new.f90 b/AMSS_NCKU_source/diff_new.f90 index 93954f1..c7be8d7 100644 --- a/AMSS_NCKU_source/diff_new.f90 +++ b/AMSS_NCKU_source/diff_new.f90 @@ -69,6 +69,7 @@ fy = ZEO fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -151,6 +152,7 @@ fx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -218,6 +220,7 @@ fy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -282,6 +285,7 @@ fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -371,6 +375,7 @@ fxz = ZEO fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -469,6 +474,7 @@ fxx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -531,6 +537,7 @@ fyy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -594,6 +601,7 @@ fzz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -657,6 +665,7 @@ fxy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -719,6 +728,7 @@ fxz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -780,6 +790,7 @@ fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -866,6 +877,7 @@ fxz = ZEO fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -997,6 +1009,7 @@ fy = ZEO fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1151,6 +1164,7 @@ fx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1227,6 +1241,7 @@ fy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do 
k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1297,6 +1312,7 @@ fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1401,6 +1417,7 @@ fxz = ZEO fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1576,6 +1593,7 @@ fxx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1643,6 +1661,7 @@ fyy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1712,6 +1731,7 @@ fzz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1781,6 +1801,7 @@ fxy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1851,6 +1872,7 @@ fxz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -1919,6 +1941,7 @@ fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -2011,6 +2034,7 @@ fy = ZEO fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -2127,6 +2151,7 @@ fx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -2212,6 +2237,7 @@ fy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -2288,6 +2314,7 @@ fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -2406,6 +2433,7 @@ fxz = ZEO fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -2593,6 +2621,7 @@ fxx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -2665,6 +2694,7 @@ fyy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -2740,6 +2770,7 @@ fzz = ZEO + !$omp parallel do collapse(2) 
private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -2815,6 +2846,7 @@ fxy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -2895,6 +2927,7 @@ fxz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -2973,6 +3006,7 @@ fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -3080,6 +3114,7 @@ fy = ZEO fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -3216,6 +3251,7 @@ fx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -3311,6 +3347,7 @@ fy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -3395,6 +3432,7 @@ fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -3530,6 +3568,7 @@ fxz = ZEO fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -3802,6 +3841,7 @@ fxx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -3883,6 +3923,7 @@ fyy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -3967,6 +4008,7 @@ fzz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -4051,6 +4093,7 @@ fxy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -4153,6 +4196,7 @@ fxz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 @@ -4253,6 +4297,7 @@ fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 diff --git a/AMSS_NCKU_source/diff_new_sh.f90 b/AMSS_NCKU_source/diff_new_sh.f90 index 91d21d7..8bcd7c1 100644 --- a/AMSS_NCKU_source/diff_new_sh.f90 +++ b/AMSS_NCKU_source/diff_new_sh.f90 
@@ -81,6 +81,7 @@ fy = ZEO fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -179,6 +180,7 @@ fx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -262,6 +264,7 @@ fy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -342,6 +345,7 @@ fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -443,6 +447,7 @@ fxz = ZEO fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -553,6 +558,7 @@ fxx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -627,6 +633,7 @@ fyy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -702,6 +709,7 @@ fzz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -777,6 +785,7 @@ fxy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -851,6 +860,7 @@ fxz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -924,6 +934,7 @@ fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1019,6 +1030,7 @@ fy = ZEO fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1134,6 +1146,7 @@ fx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1227,6 +1240,7 @@ fy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1314,6 +1328,7 @@ fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1430,6 +1445,7 @@ fxz = ZEO fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1580,6 +1596,7 @@ fxx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) 
do i=1,ex(1) @@ -1659,6 +1676,7 @@ fyy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1740,6 +1758,7 @@ fzz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1821,6 +1840,7 @@ fxy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1903,6 +1923,7 @@ fxz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1983,6 +2004,7 @@ fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -2087,6 +2109,7 @@ fy = ZEO fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -2219,6 +2242,7 @@ fx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -2321,6 +2345,7 @@ fy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -2414,6 +2439,7 @@ fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -2544,6 +2570,7 @@ fxz = ZEO fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -2743,6 +2770,7 @@ fxx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -2827,6 +2855,7 @@ fyy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -2914,6 +2943,7 @@ fzz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -3001,6 +3031,7 @@ fxy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -3093,6 +3124,7 @@ fxz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -3183,6 +3215,7 @@ fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -3302,6 +3335,7 @@ fy = ZEO fz = ZEO + !$omp parallel do collapse(2) 
private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -3454,6 +3488,7 @@ fx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -3566,6 +3601,7 @@ fy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -3667,6 +3703,7 @@ fz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -3814,6 +3851,7 @@ fxz = ZEO fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -4098,6 +4136,7 @@ fxx = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -4191,6 +4230,7 @@ fyy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -4287,6 +4327,7 @@ fzz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -4383,6 +4424,7 @@ fxy = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -4497,6 +4539,7 @@ fxz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -4609,6 +4652,7 @@ fyz = ZEO + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -4679,6 +4723,7 @@ subroutine fderivs_shc(ex,f,fx,fy,fz,crho,sigma,R,SYM1,SYM2,SYM3,Symmetry,Lev,ss #if 0 integer :: i,j,k + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -4729,6 +4774,7 @@ subroutine fdderivs_shc(ex,f,fxx,fxy,fxz,fyy,fyz,fzz,crho,sigma,R,SYM1,SYM2,SYM3 #if 0 integer :: i,j,k + !$omp parallel do collapse(2) private(i,j,k) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) diff --git a/AMSS_NCKU_source/enforce_algebra.f90 b/AMSS_NCKU_source/enforce_algebra.f90 index 2a511a5..bae4dcb 100644 --- a/AMSS_NCKU_source/enforce_algebra.f90 +++ b/AMSS_NCKU_source/enforce_algebra.f90 @@ -27,6 +27,7 @@ !~~~~~~> + !$omp parallel do collapse(2) 
private(i,j,k,lgxx,lgyy,lgzz,ldetg,lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz,ltrA,lscale) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -104,6 +105,7 @@ !~~~~~~> + !$omp parallel do collapse(2) private(i,j,k,lgxx,lgyy,lgzz,lscale,lgxy,lgxz,lgyz,lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz,ltrA) do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) diff --git a/AMSS_NCKU_source/kodiss.f90 b/AMSS_NCKU_source/kodiss.f90 index a12ada4..2e29313 100644 --- a/AMSS_NCKU_source/kodiss.f90 +++ b/AMSS_NCKU_source/kodiss.f90 @@ -65,7 +65,8 @@ real*8,intent(in) :: eps ! dx^4 ! note the sign (-1)^r-1, now r=2 - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -159,7 +160,8 @@ integer, parameter :: NO_SYMM=0, OCTANT=2 call symmetry_bd(3,ex,f,fh,SoA) - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -273,7 +275,8 @@ real*8,intent(in) :: eps ! dx^8 ! note the sign (-1)^r-1, now r=4 - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -385,7 +388,8 @@ real*8,intent(in) :: eps ! dx^10 ! note the sign (-1)^r-1, now r=5 - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) diff --git a/AMSS_NCKU_source/kodiss_sh.f90 b/AMSS_NCKU_source/kodiss_sh.f90 index c166995..7ad887e 100644 --- a/AMSS_NCKU_source/kodiss_sh.f90 +++ b/AMSS_NCKU_source/kodiss_sh.f90 @@ -80,7 +80,8 @@ real*8,intent(in) :: eps ! dx^4 ! note the sign (-1)^r-1, now r=2 - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -178,7 +179,8 @@ real*8,intent(in) :: eps ! dx^4 ! 
note the sign (-1)^r-1, now r=2 - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -273,7 +275,8 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2 call symmetry_stbd(2,ex,f,fh,SoA) - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -369,7 +372,8 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2 call symmetry_stbd(3,ex,f,fh,SoA) - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -510,7 +514,8 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2 call symmetry_stbd(3,ex,f,fh,SoA) - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -598,7 +603,8 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2 call symmetry_stbd(3,ex,f,fh,SoA) - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -694,7 +700,8 @@ real*8,intent(in) :: eps ! dx^8 ! note the sign (-1)^r-1, now r=4 - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -794,7 +801,8 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2 call symmetry_stbd(4,ex,f,fh,SoA) - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -903,7 +911,8 @@ real*8,intent(in) :: eps ! dx^10 ! 
note the sign (-1)^r-1, now r=5 - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) @@ -1006,7 +1015,8 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2 call symmetry_stbd(5,ex,f,fh,SoA) - do k=1,ex(3) + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3) do j=1,ex(2) do i=1,ex(1) diff --git a/AMSS_NCKU_source/lopsidediff.f90 b/AMSS_NCKU_source/lopsidediff.f90 index 2e97af5..0bc99fd 100644 --- a/AMSS_NCKU_source/lopsidediff.f90 +++ b/AMSS_NCKU_source/lopsidediff.f90 @@ -68,7 +68,8 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA) ! upper bound set ex-1 only for efficiency, ! the loop body will set ex 0 also - do k=1,ex(3)-1 + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 ! x direction @@ -233,7 +234,8 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA) ! upper bound set ex-1 only for efficiency, ! the loop body will set ex 0 also - do k=1,ex(3)-1 + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 #if 0 @@ -558,7 +560,8 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA) ! upper bound set ex-1 only for efficiency, ! the loop body will set ex 0 also - do k=1,ex(3)-1 + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 ! x direction @@ -774,7 +777,8 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA) ! upper bound set ex-1 only for efficiency, ! the loop body will set ex 0 also - do k=1,ex(3)-1 + !$omp parallel do collapse(2) private(i,j,k) + do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 ! 
x direction diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index f2d4e3c..0521353 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -95,8 +95,8 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h misc.o : zbesh.o # projects -ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) +ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) + $(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index c2abe44..c398111 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -6,17 +6,17 @@ ## Intel oneAPI version with oneMKL (Optimized for performance) filein = -I/usr/include/ -I${MKLROOT}/include -## Using sequential MKL (OpenMP disabled for better single-threaded performance) +## Using OpenMP-threaded MKL for parallel performance ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library -LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl +LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lifcore -limf -lpthread -lm -ldl ## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization) ## -fprofile-instr-use: use collected profile data to guide optimization decisions ## (branch prediction, basic block layout, inlining, loop unrolling) PROFDATA = /home/amss/AMSS-NCKU/pgo_profile/default.profdata -CXXAPPFLAGS = -O3 -march=native -fp-model fast=2 -fma -ipo \ +CXXAPPFLAGS = -O3 -march=native -fp-model fast=2 -fma -ipo -qopenmp \ -DMPI_STUB -Dfortran3 -Dnewc -I${MKLROOT}/include -f90appflags = -O3 -march=native 
-fp-model fast=2 -fma -ipo \ +f90appflags = -O3 -march=native -fp-model fast=2 -fma -ipo -qopenmp \ -align array64byte -fpp -I${MKLROOT}/include f90 = ifx f77 = ifx diff --git a/AMSS_NCKU_source/rungekutta4_rout.f90 b/AMSS_NCKU_source/rungekutta4_rout.f90 index 1156c8c..82b2dfe 100644 --- a/AMSS_NCKU_source/rungekutta4_rout.f90 +++ b/AMSS_NCKU_source/rungekutta4_rout.f90 @@ -109,23 +109,33 @@ if( RK4 == 0 ) then + !$omp parallel workshare f1 = f0 + HLF * dT * f_rhs + !$omp end parallel workshare elseif(RK4 == 1 ) then + !$omp parallel workshare f_rhs = f_rhs + TWO * f1 - + !$omp end parallel workshare + !$omp parallel workshare f1 = f0 + HLF * dT * f1 + !$omp end parallel workshare elseif(RK4 == 2 ) then + !$omp parallel workshare f_rhs = f_rhs + TWO * f1 - + !$omp end parallel workshare + !$omp parallel workshare f1 = f0 + dT * f1 + !$omp end parallel workshare elseif( RK4 == 3 ) then - + + !$omp parallel workshare f1 = f0 +F1o6 * dT *(f1 + f_rhs) + !$omp end parallel workshare else @@ -134,7 +144,7 @@ endif - return + return end subroutine rungekutta4_rout !----------------------------------------------------------------------------- @@ -215,15 +225,19 @@ if( RK4 == 0 ) then + !$omp parallel workshare f1 = f0 + dT * f_rhs + !$omp end parallel workshare else + !$omp parallel workshare f1 = f0 + HLF * dT * (f1+f_rhs) + !$omp end parallel workshare endif - return + return end subroutine icn_rout !~~~~~~~~~~~~~~~~~~ @@ -239,8 +253,10 @@ real*8, dimension(ex(1),ex(2),ex(3)),intent(in) ::f_rhs real*8, dimension(ex(1),ex(2),ex(3)),intent(out) ::f1 + !$omp parallel workshare f1 = f0 + dT * f_rhs + !$omp end parallel workshare - return + return end subroutine euler_rout diff --git a/makefile_and_run.py b/makefile_and_run.py index c8b0ddc..5e4f9f7 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -11,18 +11,30 @@ import AMSS_NCKU_Input as input_data import subprocess import time +import os + +## OpenMP configuration for threaded Fortran kernels +## 
OMP_NUM_THREADS: set to number of physical cores (not hyperthreads) +## OMP_PROC_BIND: bind threads to cores to avoid migration overhead +## OMP_STACKSIZE: each thread needs stack space for fh arrays (~3.6MB) +## Defaults only: values already exported by the caller take precedence +os.environ.setdefault("OMP_NUM_THREADS", "96") +os.environ.setdefault("OMP_STACKSIZE", "16M") +os.environ.setdefault("OMP_PROC_BIND", "close") +os.environ.setdefault("OMP_PLACES", "cores") ## CPU core binding configuration using taskset ## taskset ensures all child processes inherit the CPU affinity mask ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111) ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores #NUMACTL_CPU_BIND = "taskset -c 0-111" #NUMACTL_CPU_BIND = "taskset -c 16-47,64-95" -NUMACTL_CPU_BIND = "taskset -c 8-15" +#NUMACTL_CPU_BIND = "taskset -c 8-15" +NUMACTL_CPU_BIND = "" ## Build parallelism configuration ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores ## Set make -j to utilize available cores for faster builds -BUILD_JOBS = 96 +BUILD_JOBS = 16 ##################################################################