From c6e4d4ab71082cfd466560c10243ca86f77b6816 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Sat, 7 Feb 2026 13:58:55 +0800 Subject: [PATCH] Add OpenMP parallelization to BSSN RHS hot-path stencil routines Enable OpenMP threading for the dominant computational kernels: - makefile.inc: add -qopenmp to CXXAPPFLAGS, f90appflags and LDLIBS; link threaded MKL (-lmkl_intel_thread) instead of -lmkl_sequential - diff_new.f90: split fderivs/fdderivs into OpenMP interior + serial boundary - kodiss.f90: split kodis into OpenMP interior + serial boundary - lopsidediff.f90: add OMP PARALLEL DO COLLAPSE(2) to lopsided - fmisc.f90: parallelize symmetry_bd bulk array copy - bssn_rhs.f90: add OMP WORKSHARE to array-syntax operations Co-Authored-By: Claude Opus 4.6 --- AMSS_NCKU_source/bssn_rhs.f90 | 26 ++++++++- AMSS_NCKU_source/diff_new.f90 | 99 ++++++++++++++++++++++++++------ AMSS_NCKU_source/fmisc.f90 | 13 ++++- AMSS_NCKU_source/kodiss.f90 | 68 +++++++++++----------- AMSS_NCKU_source/lopsidediff.f90 | 6 +- AMSS_NCKU_source/makefile.inc | 6 +- 6 files changed, 162 insertions(+), 56 deletions(-) diff --git a/AMSS_NCKU_source/bssn_rhs.f90 b/AMSS_NCKU_source/bssn_rhs.f90 index 246b219..156dcfe 100644 --- a/AMSS_NCKU_source/bssn_rhs.f90 +++ b/AMSS_NCKU_source/bssn_rhs.f90 @@ -168,6 +168,8 @@ call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev) call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) + !$OMP PARALLEL + !$OMP WORKSHARE gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + & TWO *( gxx * betaxx + gxy * betayx + gxz * betazx) @@ -186,7 +188,7 @@ gxy * betaxz + gyy * betayz + & gxz * betaxy + gzz * betazy & - gyz * betaxx - + gxz_rhs = - TWO * alpn1 * Axz + F1o3 * gxz * div_beta + & gxx * betaxz + gxy * betayz + & gyz * betayx + gzz * betazx & @@ -201,6 +203,8 @@ gupyy = ( gxx * gzz - gxz * gxz ) / gupzz gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz gupzz = ( gxx * gyy - gxy * gxy ) / gupzz + !$OMP END WORKSHARE + !$OMP END PARALLEL if(co == 0)then ! Gam^i_Res = Gam^i + gup^ij_,j @@ -234,6 +238,8 @@ endif ! 
second kind of connection + !$OMP PARALLEL + !$OMP WORKSHARE Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz )) Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz )) Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz )) @@ -282,6 +288,8 @@ (gupxy * gupyz + gupyy * gupxz)* Axy + & (gupxy * gupzz + gupyz * gupxz)* Axz + & (gupyy * gupzz + gupyz * gupyz)* Ayz + !$OMP END WORKSHARE + !$OMP END PARALLEL ! Right hand side for Gam^i without shift terms... call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) @@ -336,6 +344,8 @@ call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev) call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev) + !$OMP PARALLEL + !$OMP WORKSHARE Gamx_rhs = Gamx_rhs + F2o3 * Gamxa * div_beta - & Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz + & F1o3 * (gupxx * fxx + gupxy * fxy + gupxz * fxz ) + & @@ -375,6 +385,8 @@ gyyz = gxz * Gamxyy + gyz * Gamyyy + gzz * Gamzyy gyzz = gxz * Gamxyz + gyz * Gamyyz + gzz * Gamzyz gzzz = gxz * Gamxzz + gyz * Gamyzz + gzz * Gamzzz + !$OMP END WORKSHARE + !$OMP END PARALLEL !compute Ricci tensor for tilted metric call fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,symmetry,Lev) @@ -401,6 +413,8 @@ Ryz = gupxx * fxx + gupyy * fyy + gupzz * fzz + & ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) * TWO + !$OMP PARALLEL + !$OMP WORKSHARE Rxx = - HALF * Rxx + & gxx * Gamxx+ gxy * Gamyx + gxz * Gamzx + & Gamxa * gxxx + Gamya * gxyx + Gamza * gxzx + & @@ -601,9 +615,13 @@ Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz + & Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy + & Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz ) + !$OMP END WORKSHARE + !$OMP END PARALLEL !covariant second derivative of chi respect to tilted metric call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) + !$OMP PARALLEL + !$OMP WORKSHARE fxx = fxx - Gamxxx * chix - Gamyxx * chiy - 
Gamzxx * chiz fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz @@ -626,11 +644,15 @@ Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO + !$OMP END WORKSHARE + !$OMP END PARALLEL ! covariant second derivatives of the lapse respect to physical metric call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, & SYM,SYM,SYM,symmetry,Lev) + !$OMP PARALLEL + !$OMP WORKSHARE gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1 gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1 gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1 @@ -791,6 +813,8 @@ !!!! gauge variable part Lap_rhs = -TWO*alpn1*trK + !$OMP END WORKSHARE + !$OMP END PARALLEL #if (GAUGE == 0) betax_rhs = FF*dtSfx betay_rhs = FF*dtSfy diff --git a/AMSS_NCKU_source/diff_new.f90 b/AMSS_NCKU_source/diff_new.f90 index 93954f1..9561fa3 100644 --- a/AMSS_NCKU_source/diff_new.f90 +++ b/AMSS_NCKU_source/diff_new.f90 @@ -997,11 +997,11 @@ fy = ZEO fz = ZEO +#if 0 do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 -#if 0 -! x direction +! x direction if(i+2 <= imax .and. i-2 >= imin)then ! ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) @@ -1018,7 +1018,7 @@ ! set imax and imin 0 endif -! y direction +! y direction if(j+2 <= jmax .and. j-2 >= jmin)then fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k)) @@ -1029,7 +1029,7 @@ ! set jmax and jmin 0 endif -! z direction +! z direction if(k+2 <= kmax .and. k-2 >= kmin)then fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2)) @@ -1040,9 +1040,13 @@ ! set kmax and kmin 0 endif + enddo + enddo + enddo #elif 0 -! x direction - if(i+2 <= imax .and. i-2 >= imin)then + do k=1,ex(3)-1 + do j=1,ex(2)-1 + do i=1,ex(1)-1 ! ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) ! fx(i) = --------------------------------------------- @@ -1079,8 +1083,32 @@ ! 
set kmax and kmin 0 endif + enddo + enddo + enddo #else -! for bam comparison +! for bam comparison — split into branch-free interior + serial boundary +! Interior: all stencil points guaranteed in-bounds, no branches needed + !$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k) + do k=max(3,1),min(ex(3)-1,kmax-2) + do j=max(3,1),min(ex(2)-1,jmax-2) + !DIR$ IVDEP + do i=max(3,1),min(ex(1)-1,imax-2) + fx(i,j,k)=d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k)) + fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k)) + fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2)) + enddo + enddo + enddo + !$OMP END PARALLEL DO + +! Boundary shell: original branching logic for points near edges + do k=1,ex(3)-1 + do j=1,ex(2)-1 + do i=1,ex(1)-1 + if(i >= 3 .and. i <= imax-2 .and. & + j >= 3 .and. j <= jmax-2 .and. & + k >= 3 .and. k <= kmax-2) cycle if(i+2 <= imax .and. i-2 >= imin .and. & j+2 <= jmax .and. j-2 >= jmin .and. & k+2 <= kmax .and. k-2 >= kmin) then @@ -1094,10 +1122,10 @@ fy(i,j,k)=d2dy*(-fh(i,j-1,k)+fh(i,j+1,k)) fz(i,j,k)=d2dz*(-fh(i,j,k-1)+fh(i,j,k+1)) endif + enddo + enddo + enddo #endif - enddo - enddo - enddo return @@ -1401,10 +1429,10 @@ fxz = ZEO fyz = ZEO +#if 0 do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 -#if 0 !~~~~~~ fxx if(i+2 <= imax .and. i-2 >= imin)then ! @@ -1481,9 +1509,48 @@ - (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2))) elseif(j+1 <= jmax .and. j-1 >= jmin .and. k+1 <= kmax .and. k-1 >= kmin)then fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1)) - endif + endif + enddo + enddo + enddo #else -! for bam comparison +! for bam comparison — split into branch-free interior + serial boundary +! 
Interior: all stencil points guaranteed in-bounds, no branches needed + !$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k) + do k=max(3,1),min(ex(3)-1,kmax-2) + do j=max(3,1),min(ex(2)-1,jmax-2) + !DIR$ IVDEP + do i=max(3,1),min(ex(1)-1,imax-2) + fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) & + -fh(i+2,j,k)+F16*fh(i+1,j,k) ) + fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) & + -fh(i,j+2,k)+F16*fh(i,j+1,k) ) + fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) & + -fh(i,j,k+2)+F16*fh(i,j,k+1) ) + fxy(i,j,k) = Fdxdy*( (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k)) & + -F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k)) & + +F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k)) & + - (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k))) + fxz(i,j,k) = Fdxdz*( (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2)) & + -F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1)) & + +F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1)) & + - (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2))) + fyz(i,j,k) = Fdydz*( (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2)) & + -F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1)) & + +F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1)) & + - (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2))) + enddo + enddo + enddo + !$OMP END PARALLEL DO + +! Boundary shell: original branching logic for points near edges + do k=1,ex(3)-1 + do j=1,ex(2)-1 + do i=1,ex(1)-1 + if(i >= 3 .and. i <= imax-2 .and. & + j >= 3 .and. j <= jmax-2 .and. & + k >= 3 .and. k <= kmax-2) cycle if(i+2 <= imax .and. i-2 >= imin .and. & j+2 <= jmax .and. j-2 >= jmin .and. & k+2 <= kmax .and. 
k-2 >= kmin) then @@ -1518,10 +1585,10 @@ fxz(i,j,k) = Sdxdz*(fh(i-1,j,k-1)-fh(i+1,j,k-1)-fh(i-1,j,k+1)+fh(i+1,j,k+1)) fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1)) endif + enddo + enddo + enddo #endif - enddo - enddo - enddo return diff --git a/AMSS_NCKU_source/fmisc.f90 b/AMSS_NCKU_source/fmisc.f90 index 1b57677..8bf41c0 100644 --- a/AMSS_NCKU_source/fmisc.f90 +++ b/AMSS_NCKU_source/fmisc.f90 @@ -881,9 +881,18 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA) real*8, dimension(-ord+1:extc(1),-ord+1:extc(2),-ord+1:extc(3)),intent(out):: funcc real*8, dimension(1:3), intent(in) :: SoA - integer::i + integer::i,j,k + + !$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k) + do k=1,extc(3) + do j=1,extc(2) + do i=1,extc(1) + funcc(i,j,k) = func(i,j,k) + enddo + enddo + enddo + !$OMP END PARALLEL DO - funcc(1:extc(1),1:extc(2),1:extc(3)) = func do i=0,ord-1 funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) enddo diff --git a/AMSS_NCKU_source/kodiss.f90 b/AMSS_NCKU_source/kodiss.f90 index a12ada4..f2eb819 100644 --- a/AMSS_NCKU_source/kodiss.f90 +++ b/AMSS_NCKU_source/kodiss.f90 @@ -159,36 +159,12 @@ integer, parameter :: NO_SYMM=0, OCTANT=2 call symmetry_bd(3,ex,f,fh,SoA) - do k=1,ex(3) - do j=1,ex(2) - do i=1,ex(1) - - if(i-3 >= imin .and. i+3 <= imax .and. & - j-3 >= jmin .and. j+3 <= jmax .and. & - k-3 >= kmin .and. k+3 <= kmax) then -#if 0 -! x direction - f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dX/cof * ( & - (fh(i-3,j,k)+fh(i+3,j,k)) - & - SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + & - FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - & - TWT* fh(i,j,k) ) -! y direction - - f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dY/cof * ( & - (fh(i,j-3,k)+fh(i,j+3,k)) - & - SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + & - FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - & - TWT* fh(i,j,k) ) -! z direction - - f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dZ/cof * ( & - (fh(i,j,k-3)+fh(i,j,k+3)) - & - SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + & - FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - & - TWT* fh(i,j,k) ) -#else -! 
calculation order if important ? +! Interior: all stencil points guaranteed in-bounds + !$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k) + do k=4,ex(3)-3 + do j=4,ex(2)-3 + !DIR$ IVDEP + do i=4,ex(1)-3 f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( & (fh(i-3,j,k)+fh(i+3,j,k)) - & SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + & @@ -204,9 +180,37 @@ integer, parameter :: NO_SYMM=0, OCTANT=2 SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + & FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - & TWT* fh(i,j,k) )/dZ ) -#endif - endif + enddo + enddo + enddo + !$OMP END PARALLEL DO +! Boundary shell: original branching logic for points near edges + do k=1,ex(3) + do j=1,ex(2) + do i=1,ex(1) + if(i >= 4 .and. i <= ex(1)-3 .and. & + j >= 4 .and. j <= ex(2)-3 .and. & + k >= 4 .and. k <= ex(3)-3) cycle + if(i-3 >= imin .and. i+3 <= imax .and. & + j-3 >= jmin .and. j+3 <= jmax .and. & + k-3 >= kmin .and. k+3 <= kmax) then + f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( & + (fh(i-3,j,k)+fh(i+3,j,k)) - & + SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + & + FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - & + TWT* fh(i,j,k) )/dX + & + ( & + (fh(i,j-3,k)+fh(i,j+3,k)) - & + SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + & + FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - & + TWT* fh(i,j,k) )/dY + & + ( & + (fh(i,j,k-3)+fh(i,j,k+3)) - & + SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + & + FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - & + TWT* fh(i,j,k) )/dZ ) + endif enddo enddo enddo diff --git a/AMSS_NCKU_source/lopsidediff.f90 b/AMSS_NCKU_source/lopsidediff.f90 index 2e97af5..fe3bcbc 100644 --- a/AMSS_NCKU_source/lopsidediff.f90 +++ b/AMSS_NCKU_source/lopsidediff.f90 @@ -231,12 +231,13 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA) call symmetry_bd(3,ex,f,fh,SoA) -! upper bound set ex-1 only for efficiency, +! upper bound set ex-1 only for efficiency, ! the loop body will set ex 0 also + !$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k) do k=1,ex(3)-1 do j=1,ex(2)-1 do i=1,ex(1)-1 -#if 0 +#if 0 !! old code ! x direction if(Sfx(i,j,k) >= ZEO .and. i+3 <= imax .and. 
i-1 >= imin)then @@ -482,6 +483,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA) enddo enddo enddo + !$OMP END PARALLEL DO return diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 489bbce..89855a7 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -8,7 +8,7 @@ filein = -I/usr/include/ -I${MKLROOT}/include ## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library -LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl +LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lifcore -limf -lpthread -lm -ldl -qopenmp ## Aggressive optimization flags: ## -O3: Maximum optimization @@ -16,9 +16,9 @@ LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore ## -fp-model fast=2: Aggressive floating-point optimizations ## -fma: Enable fused multiply-add instructions ## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues -CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ +CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \ -Dfortran3 -Dnewc -I${MKLROOT}/include -f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ +f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \ -align array64byte -fpp -I${MKLROOT}/include f90 = ifx f77 = ifx