Refactor verification method and optimize numerical kernels with oneMKL BLAS

This commit transitions the verification approach from post-Newtonian (PN) theory comparison to regression testing against baseline simulations, and optimizes critical numerical kernels using Intel oneMKL BLAS routines.

Verification Changes:
- Replace PN theory-based RMS calculation with trajectory-based comparison
- Compare optimized results against baseline (GW150914-origin) on XY plane
- Compute RMS independently for BH1 and BH2, report maximum as final metric
- Update documentation to reflect new regression test methodology

Performance Optimizations:
- Replace manual vector operations with oneMKL BLAS routines:
  * norm2() and scalarproduct() now use cblas_dnrm2/cblas_ddot (C++)
  * L2 norm calculations use DDOT for dot products (Fortran)
  * Interpolation weighted sums use DDOT (Fortran)
- Disable OpenMP threading (switch to sequential MKL) for better performance

Build Configuration:
- Switch from lmkl_intel_thread to lmkl_sequential
- Remove -qopenmp flags from compiler options
- Maintain aggressive optimization flags (-O3, -xHost, -fp-model fast=2, -fma)

Other Changes:
- Update .gitignore for GW150914-origin, docs, and temporary files
2026-01-18 14:25:21 +08:00
parent 3a7bce3af2
commit 9deeda9831
5 changed files with 170 additions and 99 deletions
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -1259,7 +1259,7 @@ end subroutine d2dump

  end subroutine polin3
 !--------------------------------------------------------------------------------------
-! calculate L2norm  
+! calculate L2norm
  subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
                          f,f_out,gw)

@@ -1276,7 +1276,9 @@ end subroutine d2dump
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k
+  integer::i,j,k,n_elements
+  real*8, dimension(:), allocatable :: f_flat
+  real*8, external :: DDOT

  dX = X(2) - X(1)
  dY = Y(2) - Y(1)
@@ -1300,7 +1302,12 @@ if(dabs(X(1)-xmin) < dX) imin = 1
 if(dabs(Y(1)-ymin) < dY) jmin = 1
 if(dabs(Z(1)-zmin) < dZ) kmin = 1

-f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
+! Sum of squares via BLAS DDOT on a contiguous copy (f_flat) of the sub-block
+n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
+allocate(f_flat(n_elements))
+f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
+f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
+deallocate(f_flat)

 f_out = f_out*dX*dY*dZ

@@ -1325,7 +1332,9 @@ f_out = f_out*dX*dY*dZ
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k
+  integer::i,j,k,n_elements
+  real*8, dimension(:), allocatable :: f_flat
+  real*8, external :: DDOT

  real*8 :: PIo4

@@ -1388,7 +1397,12 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif

-f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
+! Sum of squares via BLAS DDOT on a contiguous copy (f_flat) of the sub-block
+n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
+allocate(f_flat(n_elements))
+f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
+f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
+deallocate(f_flat)

 f_out = f_out*dX*dY*dZ

@@ -1416,6 +1430,8 @@ f_out = f_out*dX*dY*dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
  integer::i,j,k
+  real*8, dimension(:), allocatable :: f_flat
+  real*8, external :: DDOT

  real*8 :: PIo4

@@ -1478,11 +1494,12 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif

-f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
-
-f_out = f_out
-
+! Sum of squares via BLAS DDOT on a contiguous copy (f_flat) of the sub-block
 Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
+allocate(f_flat(Nout))
+f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
+f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
+deallocate(f_flat)

  return

@@ -1680,6 +1697,7 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
  real*8, dimension(ORDN,ORDN) :: tmp2
  real*8, dimension(ORDN) :: tmp1
  real*8, dimension(3) :: SoAh
+  real*8, external :: DDOT

 ! +1 because c++ gives 0 for first point
  cxB = inds+1  
@@ -1715,20 +1733,21 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
  endif 

+  ! Contract one index at a time; only the final step calls BLAS (DDOT)
+  ! Step 1: weighted sum over the third index of ya (z-direction)
  tmp2=0
  do m=1,ORDN
    tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
  enddo

+  ! Second dimension: y-direction weighted sum
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
  enddo

-  f_int=0
-  do m=1,ORDN
-    f_int = f_int + coef(m)*tmp1(m)
-  enddo
+  ! Final step: weighted sum over the first index (x-direction) using BLAS DDOT
+  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)

  return

@@ -1758,6 +1777,7 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
  real*8, dimension(ORDN,ORDN) :: ya
  real*8, dimension(ORDN) :: tmp1
  real*8, dimension(2) :: SoAh
+  real*8, external :: DDOT

 ! +1 because c++ gives 0 for first point
  cxB = inds(1:2)+1  
@@ -1787,15 +1807,14 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
  endif 

+  ! Weighted sum over the second index of ya (plain loop, no BLAS here)
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
  enddo

-  f_int=0
-  do m=1,ORDN
-    f_int = f_int + coef(m)*tmp1(m)
-  enddo
+  ! Use BLAS DDOT for final weighted sum
+  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)

  return

@@ -1826,6 +1845,7 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
  real*8, dimension(ORDN) :: ya
  real*8 :: SoAh
  integer,dimension(3) :: inds
+  real*8, external :: DDOT

 ! +1 because c++ gives 0 for first point
  inds = indsi + 1
@@ -1886,10 +1906,8 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
          write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
  endif

-  f_int=0
-  do m=1,ORDN
-    f_int = f_int + coef(m)*ya(m)
-  enddo
+  ! Optimized with BLAS DDOT for weighted sum
+  f_int = DDOT(ORDN, coef, 1, ya, 1)

  return

@@ -2121,24 +2139,38 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)

  end function fWigner_d_function
 !----------------------------------
+! Factorial function: exact lookup table for 0 <= N <= 20, and
+! exp(log_gamma(N+1)) for larger N (rounded result; still overflows for N > 170)
  function ffact(N) result(gont)
  implicit none
  integer,intent(in) :: N

  real*8 :: gont
-
  integer :: i

+  ! Lookup table for factorials 0! to 20! (precomputed)
+  real*8, parameter, dimension(0:20) :: fact_table = [ &
+    1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
+    362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
+    87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
+    355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
+    2432902008176640000.d0 ]
+
 ! sanity check
  if(N < 0)then
     write(*,*) "ffact: error input for factorial"
+     gont = 1.d0
     return
  endif

-  gont = 1.d0
-  do i=1,N
-     gont = gont*i
-  enddo
+  ! Use lookup table for small N (fast path)
+  if(N <= 20)then
+     gont = fact_table(N)
+  else
+     ! Use log-gamma function for large N: N! = exp(log_gamma(N+1))
+     ! Rounded, not exact; still overflows once N! exceeds huge(1.d0) (N > 170)
+     gont = exp(log_gamma(dble(N+1)))
+  endif

  return