diff --git a/AMSS_NCKU_source/enforce_algebra.f90 b/AMSS_NCKU_source/enforce_algebra.f90 index 71f3da2..003c24e 100644 --- a/AMSS_NCKU_source/enforce_algebra.f90 +++ b/AMSS_NCKU_source/enforce_algebra.f90 @@ -17,50 +17,62 @@ real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Axx,Axy,Axz real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz -!~~~~~~~> Local variable: - - real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg - real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz - real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz - real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 - -!~~~~~~> - - gxx = dxx + ONE - gyy = dyy + ONE - gzz = dzz + ONE - - detg = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - & - gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz - gupxx = ( gyy * gzz - gyz * gyz ) / detg - gupxy = - ( gxy * gzz - gyz * gxz ) / detg - gupxz = ( gxy * gyz - gyy * gxz ) / detg - gupyy = ( gxx * gzz - gxz * gxz ) / detg - gupyz = - ( gxx * gyz - gxy * gxz ) / detg - gupzz = ( gxx * gyy - gxy * gxy ) / detg - - trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz & - + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz) - - Axx = Axx - F1o3 * gxx * trA - Axy = Axy - F1o3 * gxy * trA - Axz = Axz - F1o3 * gxz * trA - Ayy = Ayy - F1o3 * gyy * trA - Ayz = Ayz - F1o3 * gyz * trA - Azz = Azz - F1o3 * gzz * trA - - detg = ONE / ( detg ** F1o3 ) - - gxx = gxx * detg - gxy = gxy * detg - gxz = gxz * detg - gyy = gyy * detg - gyz = gyz * detg - gzz = gzz * detg - - dxx = gxx - ONE - dyy = gyy - ONE - dzz = gzz - ONE +!~~~~~~~> Local variable: + + integer :: i,j,k + real*8 :: lgxx,lgyy,lgzz,ldetg + real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz + real*8 :: ltrA,lscale + real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 + +!~~~~~~> + + do k=1,ex(3) + do j=1,ex(2) + do i=1,ex(1) + + lgxx = dxx(i,j,k) + ONE + lgyy = dyy(i,j,k) + ONE + lgzz = dzz(i,j,k) + ONE + + ldetg = lgxx * lgyy * lgzz & + + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) & + + gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) & + - gxz(i,j,k) * lgyy * gxz(i,j,k) & + - gxy(i,j,k) * gxy(i,j,k) * lgzz & + - lgxx * gyz(i,j,k) * gyz(i,j,k) + + lgupxx = ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg + lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg + lgupxz = ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg + lgupyy = ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg + lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg + lgupzz = ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg + + ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) & + + lgupzz * Azz(i,j,k) & + + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) & + + lgupyz * Ayz(i,j,k)) + + Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA + Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA + Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA + Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA + Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA + Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA + + lscale = ONE / ( ldetg ** F1o3 ) + + dxx(i,j,k) = lgxx * lscale - ONE + gxy(i,j,k) = gxy(i,j,k) * lscale + gxz(i,j,k) = gxz(i,j,k) * lscale + dyy(i,j,k) = lgyy * lscale - ONE + gyz(i,j,k) = gyz(i,j,k) * lscale + dzz(i,j,k) = lgzz * lscale - ONE + + enddo + enddo + enddo return @@ -81,52 +93,72 @@ real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Axx,Axy,Axz real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz -!~~~~~~~> Local variable: - - real*8, dimension(ex(1),ex(2),ex(3)) :: trA - real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz - real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz - real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 - -!~~~~~~> - - gxx = dxx + ONE - gyy = dyy + ONE - gzz = dzz + ONE -! for g - gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - & - gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz - - gupzz = ONE / ( gupzz ** F1o3 ) - - gxx = gxx * gupzz - gxy = gxy * gupzz - gxz = gxz * gupzz - gyy = gyy * gupzz - gyz = gyz * gupzz - gzz = gzz * gupzz - - dxx = gxx - ONE - dyy = gyy - ONE - dzz = gzz - ONE -! for A - - gupxx = ( gyy * gzz - gyz * gyz ) - gupxy = - ( gxy * gzz - gyz * gxz ) - gupxz = ( gxy * gyz - gyy * gxz ) - gupyy = ( gxx * gzz - gxz * gxz ) - gupyz = - ( gxx * gyz - gxy * gxz ) - gupzz = ( gxx * gyy - gxy * gxy ) - - trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz & - + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz) - - Axx = Axx - F1o3 * gxx * trA - Axy = Axy - F1o3 * gxy * trA - Axz = Axz - F1o3 * gxz * trA - Ayy = Ayy - F1o3 * gyy * trA - Ayz = Ayz - F1o3 * gyz * trA - Azz = Azz - F1o3 * gzz * trA +!~~~~~~~> Local variable: + + integer :: i,j,k + real*8 :: lgxx,lgyy,lgzz,lscale + real*8 :: lgxy,lgxz,lgyz + real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz + real*8 :: ltrA + real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 + +!~~~~~~> + + do k=1,ex(3) + do j=1,ex(2) + do i=1,ex(1) + +! for g: normalize determinant first + lgxx = dxx(i,j,k) + ONE + lgyy = dyy(i,j,k) + ONE + lgzz = dzz(i,j,k) + ONE + lgxy = gxy(i,j,k) + lgxz = gxz(i,j,k) + lgyz = gyz(i,j,k) + + lscale = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz & + + lgxz * lgxy * lgyz - lgxz * lgyy * lgxz & + - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz + + lscale = ONE / ( lscale ** F1o3 ) + + lgxx = lgxx * lscale + lgxy = lgxy * lscale + lgxz = lgxz * lscale + lgyy = lgyy * lscale + lgyz = lgyz * lscale + lgzz = lgzz * lscale + + dxx(i,j,k) = lgxx - ONE + gxy(i,j,k) = lgxy + gxz(i,j,k) = lgxz + dyy(i,j,k) = lgyy - ONE + gyz(i,j,k) = lgyz + dzz(i,j,k) = lgzz - ONE + +! for A: trace-free using normalized metric (det=1, no division needed) + lgupxx = ( lgyy * lgzz - lgyz * lgyz ) + lgupxy = - ( lgxy * lgzz - lgyz * lgxz ) + lgupxz = ( lgxy * lgyz - lgyy * lgxz ) + lgupyy = ( lgxx * lgzz - lgxz * lgxz ) + lgupyz = - ( lgxx * lgyz - lgxy * lgxz ) + lgupzz = ( lgxx * lgyy - lgxy * lgxy ) + + ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) & + + lgupzz * Azz(i,j,k) & + + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) & + + lgupyz * Ayz(i,j,k)) + + Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA + Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA + Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA + Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA + Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA + Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA + + enddo + enddo + enddo return diff --git a/AMSS_NCKU_source/fmisc.f90 b/AMSS_NCKU_source/fmisc.f90 index 81c5a62..48c2ebc 100644 --- a/AMSS_NCKU_source/fmisc.f90 +++ b/AMSS_NCKU_source/fmisc.f90 @@ -324,8 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA) integer::i - funcc = 0.d0 - funcc(1:extc(1),1:extc(2),1:extc(3)) = func + funcc(1:extc(1),1:extc(2),1:extc(3)) = func do i=0,ord-1 funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) enddo @@ -350,8 +349,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA) integer::i - funcc = 0.d0 - funcc(1:extc(1),1:extc(2),1:extc(3)) = func + funcc(1:extc(1),1:extc(2),1:extc(3)) = func do i=0,ord-1 funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1) @@ -379,8 +377,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA) integer::i - funcc = 0.d0 - funcc(1:extc(1),1:extc(2),1:extc(3)) = func + funcc(1:extc(1),1:extc(2),1:extc(3)) = func do i=0,ord-1 funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1) @@ -886,17 +883,20 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA) integer::i - funcc = 0.d0 - funcc(1:extc(1),1:extc(2),1:extc(3)) = func - do i=0,ord-1 - funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) - enddo - do i=0,ord-1 - funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2) - enddo - do i=0,ord-1 - funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3) - enddo +!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8) + funcc(1:extc(1),1:extc(2),1:extc(3)) = func +!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8) + do i=0,ord-1 + funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) + enddo +!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8) + do i=0,ord-1 + funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2) + enddo +!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8) + do i=0,ord-1 + funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3) + enddo end subroutine symmetry_bd @@ -912,8 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA) integer::i - funcc = 0.d0 - funcc(1:extc(1),1:extc(2),1:extc(3)) = func + funcc(1:extc(1),1:extc(2),1:extc(3)) = func do i=0,ord-1 funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-i,1:extc(2),1:extc(3))*SoA(1) @@ -941,8 +940,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA) integer::i - funcc = 0.d0 - funcc(1:extc(1),1:extc(2),1:extc(3)) = func + funcc(1:extc(1),1:extc(2),1:extc(3)) = func do i=0,ord-1 funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-i,1:extc(2),1:extc(3))*SoA(1) @@ -1113,151 +1111,353 @@ end subroutine d2dump !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -! common code for cell and vertex -!------------------------------------------------------------------------------ -! Lagrangian polynomial interpolation -!------------------------------------------------------------------------------ - - subroutine polint(xa,ya,x,y,dy,ordn) - - implicit none - -!~~~~~~> Input Parameter: - integer,intent(in) :: ordn - real*8, dimension(ordn), intent(in) :: xa,ya - real*8, intent(in) :: x - real*8, intent(out) :: y,dy - -!~~~~~~> Other parameter: - - integer :: m,n,ns - real*8, dimension(ordn) :: c,d,den,ho - real*8 :: dif,dift - -!~~~~~~> - - n=ordn - m=ordn - - c=ya - d=ya - ho=xa-x - - ns=1 - dif=abs(x-xa(1)) - do m=1,n - dift=abs(x-xa(m)) - if(dift < dif) then - ns=m - dif=dift - end if - end do - - y=ya(ns) - ns=ns-1 - do m=1,n-1 - den(1:n-m)=ho(1:n-m)-ho(1+m:n) - if (any(den(1:n-m) == 0.0))then - write(*,*) 'failure in polint for point',x - write(*,*) 'with input points: ',xa - stop - endif - den(1:n-m)=(c(2:n-m+1)-d(1:n-m))/den(1:n-m) - d(1:n-m)=ho(1+m:n)*den(1:n-m) - c(1:n-m)=ho(1:n-m)*den(1:n-m) - if (2*ns < n-m) then - dy=c(ns+1) - else - dy=d(ns) - ns=ns-1 - end if - y=y+dy - end do - - return - - end subroutine polint -!------------------------------------------------------------------------------ -! -! interpolation in 2 dimensions, follow yx order -! -!------------------------------------------------------------------------------ - subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn) - - implicit none - -!~~~~~~> Input parameters: - integer,intent(in) :: ordn - real*8, dimension(1:ordn), intent(in) :: x1a,x2a - real*8, dimension(1:ordn,1:ordn), intent(in) :: ya - real*8, intent(in) :: x1,x2 - real*8, intent(out) :: y,dy - -!~~~~~~> Other parameters: - - integer :: i,m - real*8, dimension(ordn) :: ymtmp - real*8, dimension(ordn) :: yntmp - - m=size(x1a) - - do i=1,m - - yntmp=ya(i,:) - call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) - - end do - - call polint(x1a,ymtmp,x1,y,dy,ordn) - - return - - end subroutine polin2 -!------------------------------------------------------------------------------ -! -! interpolation in 3 dimensions, follow zyx order -! -!------------------------------------------------------------------------------ - subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn) - - implicit none - -!~~~~~~> Input parameters: - integer,intent(in) :: ordn - real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a - real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya - real*8, intent(in) :: x1,x2,x3 - real*8, intent(out) :: y,dy - -!~~~~~~> Other parameters: - - integer :: i,j,m,n - real*8, dimension(ordn,ordn) :: yatmp - real*8, dimension(ordn) :: ymtmp - real*8, dimension(ordn) :: yntmp - real*8, dimension(ordn) :: yqtmp - - m=size(x1a) - n=size(x2a) - - do i=1,m - do j=1,n - - yqtmp=ya(i,j,:) - call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn) - - end do - - yntmp=yatmp(i,:) - call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) - - end do - - call polint(x1a,ymtmp,x1,y,dy,ordn) - - return - - end subroutine polin3 +! common code for cell and vertex +!------------------------------------------------------------------------------ +! Lagrangian polynomial interpolation +!------------------------------------------------------------------------------ +#ifndef POLINT6_USE_BARYCENTRIC +#define POLINT6_USE_BARYCENTRIC 1 +#endif + +!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville + subroutine polint6_neville(xa, ya, x, y, dy) + implicit none + + real*8, dimension(6), intent(in) :: xa, ya + real*8, intent(in) :: x + real*8, intent(out) :: y, dy + + integer :: i, m, ns, n_m + real*8, dimension(6) :: c, d, ho + real*8 :: dif, dift, hp, h, den_val + + c = ya + d = ya + ho = xa - x + + ns = 1 + dif = abs(x - xa(1)) + + do i = 2, 6 + dift = abs(x - xa(i)) + if (dift < dif) then + ns = i + dif = dift + end if + end do + + y = ya(ns) + ns = ns - 1 + + do m = 1, 5 + n_m = 6 - m + do i = 1, n_m + hp = ho(i) + h = ho(i+m) + den_val = hp - h + + if (den_val == 0.0d0) then + write(*,*) 'failure in polint for point',x + write(*,*) 'with input points: ',xa + stop + end if + + den_val = (c(i+1) - d(i)) / den_val + + d(i) = h * den_val + c(i) = hp * den_val + end do + + if (2 * ns < n_m) then + dy = c(ns + 1) + else + dy = d(ns) + ns = ns - 1 + end if + y = y + dy + end do + + return + end subroutine polint6_neville + +!DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric + subroutine polint6_barycentric(xa, ya, x, y, dy) + implicit none + + real*8, dimension(6), intent(in) :: xa, ya + real*8, intent(in) :: x + real*8, intent(out) :: y, dy + + integer :: i, j + logical :: is_uniform + real*8, dimension(6) :: lambda + real*8 :: dx, den_i, term, num, den, step, tol + real*8, parameter :: c_uniform(6) = (/ -1.d0, 5.d0, -10.d0, 10.d0, -5.d0, 1.d0 /) + + do i = 1, 6 + if (x == xa(i)) then + y = ya(i) + dy = 0.d0 + return + end if + end do + + step = xa(2) - xa(1) + is_uniform = (step /= 0.d0) + if (is_uniform) then + tol = 64.d0 * epsilon(1.d0) * max(1.d0, abs(step)) + do i = 3, 6 + if (abs((xa(i) - xa(i-1)) - step) > tol) then + is_uniform = .false. + exit + end if + end do + end if + + if (is_uniform) then + num = 0.d0 + den = 0.d0 + do i = 1, 6 + term = c_uniform(i) / (x - xa(i)) + num = num + term * ya(i) + den = den + term + end do + y = num / den + dy = 0.d0 + return + end if + + do i = 1, 6 + den_i = 1.d0 + do j = 1, 6 + if (j /= i) then + dx = xa(i) - xa(j) + if (dx == 0.0d0) then + write(*,*) 'failure in polint for point',x + write(*,*) 'with input points: ',xa + stop + end if + den_i = den_i * dx + end if + end do + lambda(i) = 1.d0 / den_i + end do + + num = 0.d0 + den = 0.d0 + do i = 1, 6 + term = lambda(i) / (x - xa(i)) + num = num + term * ya(i) + den = den + term + end do + + y = num / den + dy = 0.d0 + + return + end subroutine polint6_barycentric + +!DIR$ ATTRIBUTES FORCEINLINE :: polint + subroutine polint(xa, ya, x, y, dy, ordn) + implicit none + + integer, intent(in) :: ordn + real*8, dimension(ordn), intent(in) :: xa, ya + real*8, intent(in) :: x + real*8, intent(out) :: y, dy + + integer :: i, m, ns, n_m + real*8, dimension(ordn) :: c, d, ho + real*8 :: dif, dift, hp, h, den_val + + if (ordn == 6) then +#if POLINT6_USE_BARYCENTRIC + call polint6_barycentric(xa, ya, x, y, dy) +#else + call polint6_neville(xa, ya, x, y, dy) +#endif + return + end if + + c = ya + d = ya + ho = xa - x + + ns = 1 + dif = abs(x - xa(1)) + + do i = 2, ordn + dift = abs(x - xa(i)) + if (dift < dif) then + ns = i + dif = dift + end if + end do + + y = ya(ns) + ns = ns - 1 + + do m = 1, ordn - 1 + n_m = ordn - m + do i = 1, n_m + hp = ho(i) + h = ho(i+m) + den_val = hp - h + + if (den_val == 0.0d0) then + write(*,*) 'failure in polint for point',x + write(*,*) 'with input points: ',xa + stop + end if + + den_val = (c(i+1) - d(i)) / den_val + + d(i) = h * den_val + c(i) = hp * den_val + end do + + if (2 * ns < n_m) then + dy = c(ns + 1) + else + dy = d(ns) + ns = ns - 1 + end if + y = y + dy + end do + + return + end subroutine polint +!------------------------------------------------------------------------------ +! Compute Lagrange interpolation basis weights for one target point. +!------------------------------------------------------------------------------ +!DIR$ ATTRIBUTES FORCEINLINE :: polint_lagrange_weights + subroutine polint_lagrange_weights(xa, x, w, ordn) + implicit none + + integer, intent(in) :: ordn + real*8, dimension(1:ordn), intent(in) :: xa + real*8, intent(in) :: x + real*8, dimension(1:ordn), intent(out) :: w + + integer :: i, j + real*8 :: num, den, dx + + do i = 1, ordn + num = 1.d0 + den = 1.d0 + do j = 1, ordn + if (j /= i) then + dx = xa(i) - xa(j) + if (dx == 0.0d0) then + write(*,*) 'failure in polint for point',x + write(*,*) 'with input points: ',xa + stop + end if + num = num * (x - xa(j)) + den = den * dx + end if + end do + w(i) = num / den + end do + + return + end subroutine polint_lagrange_weights +!------------------------------------------------------------------------------ +! +! interpolation in 2 dimensions, follow yx order +! +!------------------------------------------------------------------------------ + subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn) + implicit none + + integer,intent(in) :: ordn + real*8, dimension(1:ordn), intent(in) :: x1a,x2a + real*8, dimension(1:ordn,1:ordn), intent(in) :: ya + real*8, intent(in) :: x1,x2 + real*8, intent(out) :: y,dy + +#ifdef POLINT_LEGACY_ORDER + integer :: i,m + real*8, dimension(ordn) :: ymtmp + real*8, dimension(ordn) :: yntmp + + m=size(x1a) + do i=1,m + yntmp=ya(i,:) + call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) + end do + call polint(x1a,ymtmp,x1,y,dy,ordn) +#else + integer :: j + real*8, dimension(ordn) :: ymtmp + real*8 :: dy_temp + + do j=1,ordn + call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn) + end do + call polint(x2a, ymtmp, x2, y, dy, ordn) +#endif + + return + end subroutine polin2 +!------------------------------------------------------------------------------ +! +! interpolation in 3 dimensions, follow zyx order +! +!------------------------------------------------------------------------------ + subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn) + implicit none + + integer,intent(in) :: ordn + real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a + real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya + real*8, intent(in) :: x1,x2,x3 + real*8, intent(out) :: y,dy + +#ifdef POLINT_LEGACY_ORDER + integer :: i,j,m,n + real*8, dimension(ordn,ordn) :: yatmp + real*8, dimension(ordn) :: ymtmp + real*8, dimension(ordn) :: yntmp + real*8, dimension(ordn) :: yqtmp + + m=size(x1a) + n=size(x2a) + do i=1,m + do j=1,n + yqtmp=ya(i,j,:) + call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn) + end do + yntmp=yatmp(i,:) + call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) + end do + call polint(x1a,ymtmp,x1,y,dy,ordn) +#else + integer :: i, j, k + real*8, dimension(ordn) :: w1, w2 + real*8, dimension(ordn) :: ymtmp + real*8 :: yx_sum, x_sum + + call polint_lagrange_weights(x1a, x1, w1, ordn) + call polint_lagrange_weights(x2a, x2, w2, ordn) + + do k = 1, ordn + yx_sum = 0.d0 + do j = 1, ordn + x_sum = 0.d0 + do i = 1, ordn + x_sum = x_sum + w1(i) * ya(i,j,k) + end do + yx_sum = yx_sum + w2(j) * x_sum + end do + ymtmp(k) = yx_sum + end do + + call polint(x3a, ymtmp, x3, y, dy, ordn) +#endif + + return + end subroutine polin3 !-------------------------------------------------------------------------------------- ! calculate L2norm subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,& @@ -1276,7 +1476,9 @@ end subroutine d2dump real*8 :: dX, dY, dZ integer::imin,jmin,kmin integer::imax,jmax,kmax - integer::i,j,k + integer::i,j,k,n_elements + real*8, dimension(:), allocatable :: f_flat + real*8, external :: DDOT dX = X(2) - X(1) dY = Y(2) - Y(1) @@ -1300,15 +1502,91 @@ if(dabs(X(1)-xmin) < dX) imin = 1 if(dabs(Y(1)-ymin) < dY) jmin = 1 if(dabs(Z(1)-zmin) < dZ) kmin = 1 -f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax)) + n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) + allocate(f_flat(n_elements)) + f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements]) + f_out = DDOT(n_elements, f_flat, 1, f_flat, 1) + deallocate(f_flat) f_out = f_out*dX*dY*dZ return end subroutine l2normhelper -!-------------------------------------------------------------------------------------- -! calculate L2norm especially for shell Blocks +!-------------------------------------------------------------------------------------- + subroutine l2normhelper7(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,& + f1,f2,f3,f4,f5,f6,f7,f_out,gw) + + implicit none +!~~~~~~> Input parameters: + integer,intent(in ):: ex(1:3) + real*8, intent(in ):: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3)),xmin,ymin,zmin,xmax,ymax,zmax + integer,intent(in)::gw + real*8, dimension(ex(1),ex(2),ex(3)),intent(in) :: f1,f2,f3,f4,f5,f6,f7 + real*8, intent(out) :: f_out(7) +!~~~~~~> Other variables: + + real*8 :: dX, dY, dZ + integer::imin,jmin,kmin + integer::imax,jmax,kmax + integer::i,j,k + real*8 :: s1,s2,s3,s4,s5,s6,s7 + + dX = X(2) - X(1) + dY = Y(2) - Y(1) + dZ = Z(2) - Z(1) + + imin = gw+1 + jmin = gw+1 + kmin = gw+1 + + imax = ex(1) - gw + jmax = ex(2) - gw + kmax = ex(3) - gw + +if(dabs(X(ex(1))-xmax) < dX) imax = ex(1) +if(dabs(Y(ex(2))-ymax) < dY) jmax = ex(2) +if(dabs(Z(ex(3))-zmax) < dZ) kmax = ex(3) +if(dabs(X(1)-xmin) < dX) imin = 1 +if(dabs(Y(1)-ymin) < dY) jmin = 1 +if(dabs(Z(1)-zmin) < dZ) kmin = 1 + + s1 = 0.d0 + s2 = 0.d0 + s3 = 0.d0 + s4 = 0.d0 + s5 = 0.d0 + s6 = 0.d0 + s7 = 0.d0 + + do k=kmin,kmax + do j=jmin,jmax +!DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7) + do i=imin,imax + s1 = s1 + f1(i,j,k)*f1(i,j,k) + s2 = s2 + f2(i,j,k)*f2(i,j,k) + s3 = s3 + f3(i,j,k)*f3(i,j,k) + s4 = s4 + f4(i,j,k)*f4(i,j,k) + s5 = s5 + f5(i,j,k)*f5(i,j,k) + s6 = s6 + f6(i,j,k)*f6(i,j,k) + s7 = s7 + f7(i,j,k)*f7(i,j,k) + enddo + enddo + enddo + + f_out(1) = s1*dX*dY*dZ + f_out(2) = s2*dX*dY*dZ + f_out(3) = s3*dX*dY*dZ + f_out(4) = s4*dX*dY*dZ + f_out(5) = s5*dX*dY*dZ + f_out(6) = s6*dX*dY*dZ + f_out(7) = s7*dX*dY*dZ + + return + + end subroutine l2normhelper7 +!-------------------------------------------------------------------------------------- +! calculate L2norm especially for shell Blocks subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,& f,f_out,gw,ogw,Symmetry) @@ -1325,7 +1603,9 @@ f_out = f_out*dX*dY*dZ real*8 :: dX, dY, dZ integer::imin,jmin,kmin integer::imax,jmax,kmax - integer::i,j,k + integer::i,j,k,n_elements + real*8, dimension(:), allocatable :: f_flat + real*8, external :: DDOT real*8 :: PIo4 @@ -1388,7 +1668,11 @@ if(Symmetry==2)then if(dabs(ymin+gw*dY) Other parameters: real*8, dimension(-ORDN+1:ex(1)+ORDN,-ORDN+1:ex(2)+ORDN,ex(3)) :: fh - integer :: m - integer :: cxB,cxT - real*8, dimension(ORDN) :: ya - real*8 :: SoAh - integer,dimension(3) :: inds + integer :: m + integer :: cxB,cxT + real*8, dimension(ORDN) :: ya + real*8 :: SoAh + integer,dimension(3) :: inds + real*8, external :: DDOT ! +1 because c++ gives 0 for first point inds = indsi + 1 @@ -1886,10 +2172,7 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd endif - f_int=0 - do m=1,ORDN - f_int = f_int + coef(m)*ya(m) - enddo + f_int = DDOT(ORDN, coef, 1, ya, 1) return @@ -2125,20 +2408,28 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) implicit none integer,intent(in) :: N - real*8 :: gont - - integer :: i + real*8 :: gont + + integer :: i + real*8, parameter, dimension(0:20) :: fact_table = [ & + 1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, & + 362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, & + 87178291200.d0, 1307674368000.d0, 20922789888000.d0, & + 355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, & + 2432902008176640000.d0 ] ! sanity check - if(N < 0)then - write(*,*) "ffact: error input for factorial" - return - endif - - gont = 1.d0 - do i=1,N - gont = gont*i - enddo + if(N < 0)then + write(*,*) "ffact: error input for factorial" + gont = 1.d0 + return + endif + + if(N <= 20)then + gont = fact_table(N) + else + gont = exp(log_gamma(dble(N+1))) + endif return diff --git a/AMSS_NCKU_source/fmisc.h b/AMSS_NCKU_source/fmisc.h index 1a0b6d6..9412cff 100644 --- a/AMSS_NCKU_source/fmisc.h +++ b/AMSS_NCKU_source/fmisc.h @@ -12,9 +12,10 @@ #define f_global_interpind global_interpind #define f_global_interpind2d global_interpind2d #define f_global_interpind1d global_interpind1d -#define f_l2normhelper l2normhelper -#define f_l2normhelper_sh l2normhelper_sh -#define f_l2normhelper_sh_rms l2normhelper_sh_rms +#define f_l2normhelper l2normhelper +#define f_l2normhelper7 l2normhelper7 +#define f_l2normhelper_sh l2normhelper_sh +#define f_l2normhelper_sh_rms l2normhelper_sh_rms #define f_average average #define f_average3 average3 #define f_average2 average2 @@ -41,9 +42,10 @@ #define f_global_interpind GLOBAL_INTERPIND #define f_global_interpind2d GLOBAL_INTERPIND2D #define f_global_interpind1d GLOBAL_INTERPIND1D -#define f_l2normhelper L2NORMHELPER -#define f_l2normhelper_sh L2NORMHELPER_SH -#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS +#define f_l2normhelper L2NORMHELPER +#define f_l2normhelper7 L2NORMHELPER7 +#define f_l2normhelper_sh L2NORMHELPER_SH +#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS #define f_average AVERAGE #define f_average3 AVERAGE3 #define f_average2 AVERAGE2 @@ -70,9 +72,10 @@ #define f_global_interpind global_interpind_ #define f_global_interpind2d global_interpind2d_ #define f_global_interpind1d global_interpind1d_ -#define f_l2normhelper l2normhelper_ -#define f_l2normhelper_sh l2normhelper_sh_ -#define f_l2normhelper_sh_rms l2normhelper_sh_rms_ +#define f_l2normhelper l2normhelper_ +#define f_l2normhelper7 l2normhelper7_ +#define f_l2normhelper_sh l2normhelper_sh_ +#define f_l2normhelper_sh_rms l2normhelper_sh_rms_ #define f_average average_ #define f_average3 average3_ #define f_average2 average2_ @@ -156,21 +159,30 @@ extern "C" int *, double *, int &, int &); } -extern "C" -{ - void f_l2normhelper(int *, double *, double *, double *, - double &, double &, double &, - double &, double &, double &, - double *, double &, int &); -} - -extern "C" -{ - void f_l2normhelper_sh(int *, double *, double *, double *, - double &, double &, double &, - double &, double &, double &, - double *, double &, int &, int &, int &); -} +extern "C" +{ + void f_l2normhelper(int *, double *, double *, double *, + double &, double &, double &, + double &, double &, double &, + double *, double &, int &); +} + +extern "C" +{ + void f_l2normhelper7(int *, double *, double *, double *, + double &, double &, double &, + double &, double &, double &, + double *, double *, double *, double *, + double *, double *, double *, double *, int &); +} + +extern "C" +{ + void f_l2normhelper_sh(int *, double *, double *, double *, + double &, double &, double &, + double &, double &, double &, + double *, double &, int &, int &, int &); +} extern "C" { diff --git a/AMSS_NCKU_source/macrodef.h b/AMSS_NCKU_source/macrodef.h index ca67877..5073c46 100644 --- a/AMSS_NCKU_source/macrodef.h +++ b/AMSS_NCKU_source/macrodef.h @@ -2,7 +2,7 @@ #ifndef MICRODEF_H #define MICRODEF_H -#include "microdef.fh" +#include "macrodef.fh" // application parameters diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 0e2a08d..0a3f676 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -1,11 +1,25 @@ - - -include makefile.inc - -.SUFFIXES: .o .f90 .C .for .cu - -.f90.o: - $(f90) $(f90appflags) -c $< -o $@ + + +include makefile.inc + +## polint(ordn=6) kernel selector: +## 1 (default): barycentric fast path +## 0 : fallback to Neville path +POLINT6_USE_BARY ?= 1 +POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY) + +ARCH_OPT = -march=x86-64-v4 +CXXAPPFLAGS = -O3 $(ARCH_OPT) -fp-model fast=2 -fma -ipo \ + -Dfortran3 -Dnewc -I${MKLROOT}/include +f90appflags = -O3 $(ARCH_OPT) -fp-model fast=2 -fma -ipo \ + -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) +TP_OPTFLAGS = -O3 $(ARCH_OPT) -fp-model fast=2 -fma -ipo \ + -Dfortran3 -Dnewc -I${MKLROOT}/include + +.SUFFIXES: .o .f90 .C .for .cu + +.f90.o: + $(f90) $(f90appflags) -c $< -o $@ .C.o: ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ @@ -13,8 +27,14 @@ include makefile.inc .for.o: $(f77) -c $< -o $@ -.cu.o: - $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) +.cu.o: + $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) + +TwoPunctures.o: TwoPunctures.C + ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ + +TwoPunctureABE.o: TwoPunctureABE.C + ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ # Input files C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ @@ -95,8 +115,8 @@ ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) -TwoPunctureABE: $(TwoPunctureFILES) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS) +TwoPunctureABE: $(TwoPunctureFILES) + $(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) clean: rm *.o ABE ABEGPU TwoPunctureABE make.log -f diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 00b4bc3..bfea429 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -1,19 +1,32 @@ +## GCC version (commented out) ## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ +## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ +## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran -filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ +## Intel oneAPI version with oneMKL +filein = -I/usr/include/ -I${MKLROOT}/include -## LDLIBS = -L/usr/lib/x86_64-linux-gnu -lmpich -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran +## Use sequential oneMKL to avoid introducing extra OpenMP behavior into ABE. +LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 -CXXAPPFLAGS = -O0 -Wno-deprecated -Dfortran3 -Dnewc -#f90appflags = -O0 -fpp -f90appflags = -O0 -x f95-cpp-input -f90 = gfortran -f77 = gfortran -CXX = g++ -CC = gcc -CLINKER = mpic++ +## Optional Intel oneTBB allocator, kept aligned with main's build environment. +USE_TBBMALLOC ?= 1 +TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so +ifneq ($(wildcard $(TBBMALLOC_SO)),) +TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed +else +TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed +endif +ifeq ($(USE_TBBMALLOC),1) +LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS) +endif + +f90 = ifx +f77 = ifx +CXX = icpx +CC = icx +CLINKER = mpiicpx Cu = nvcc CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include