Refactor verification method and optimize numerical kernels with oneMKL BLAS

This commit transitions the verification approach from post-Newtonian theory comparison to regression testing against baseline simulations, and optimizes critical numerical kernels using Intel oneMKL BLAS routines. Verification Changes: - Replace PN theory-based RMS calculation with trajectory-based comparison - Compare optimized results against baseline (GW150914-origin) on XY plane - Compute RMS independently for BH1 and BH2, report maximum as final metric - Update documentation to reflect new regression test methodology Performance Optimizations: - Replace manual vector operations with oneMKL BLAS routines: * norm2() and scalarproduct() now use cblas_dnrm2/cblas_ddot (C++) * L2 norm calculations use DDOT for dot products (Fortran) * Interpolation weighted sums use DDOT (Fortran) - Disable OpenMP threading (switch to sequential MKL) for better performance Build Configuration: - Switch from lmkl_intel_thread to lmkl_sequential - Remove -qopenmp flags from compiler options - Maintain aggressive optimization flags (-O3, -xHost, -fp-model fast=2, -fma) Other Changes: - Update .gitignore for GW150914-origin, docs, and temporary files
2026-01-18 14:25:21 +08:00
parent 3a7bce3af2
commit 9deeda9831
5 changed files with 170 additions and 99 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,6 @@
 __pycache__
 GW150914
 GW150914-origin
 docs
 *.tmp
--- a/AMSS_NCKU_Verify_ASC26.py
+++ b/AMSS_NCKU_Verify_ASC26.py
@@ -1,13 +1,19 @@
 #!/usr/bin/env python3
 """
-AMSS-NCKU GW150914 Simulation Accuracy Verification Script
+AMSS-NCKU GW150914 Simulation Regression Test Script
 Verification Requirements:
-1. RMS error < 1% (Black hole trajectory vs. post-Newtonian theory)
+1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
 2. ADM constraint violation < 2 (Grid Level 0)
-Usage: python3 verify_accuracy.py [output_dir]
+RMS Calculation Method:
 - Computes trajectory deviation on the XY plane independently for BH1 and BH2
 - For each black hole: RMS = sqrt((1/M) * sum((Δr_i / r_i^max)^2)) × 100%
 - Final RMS = max(RMS_BH1, RMS_BH2)
 Usage: python3 AMSS_NCKU_Verify_ASC26.py [output_dir]
 Default: output_dir = GW150914/AMSS_NCKU_output
 Reference: GW150914-origin (baseline simulation)
 """
 import numpy as np
@@ -52,51 +58,77 @@ def load_constraint_data(filepath):
    return np.array(data)
-def calculate_rms_error(bh_data, M1=0.5538461539, M2=0.4461538472):
+def calculate_rms_error(bh_data_ref, bh_data_target):
    """
-    Calculate RMS error
+    Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
-    Compare numerical orbit with post-Newtonian (PN) theory prediction
+
    This function computes the RMS error independently for BH1 and BH2 trajectories,
    then returns the maximum of the two as the final RMS error metric.
    For each black hole, the RMS is calculated as:
        RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
    where:
        Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
        r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
    Args:
        bh_data_ref: Reference (baseline) trajectory data
        bh_data_target: Target (optimized) trajectory data
    Returns:
        rms_value: Final RMS error as a percentage (max of BH1 and BH2)
        error: Error message if any
    """
-    time = bh_data['time']
+    # Align data: truncate to the length of the shorter dataset
-    x1, y1, z1 = bh_data['x1'], bh_data['y1'], bh_data['z1']
+    M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
    x2, y2, z2 = bh_data['x2'], bh_data['y2'], bh_data['z2']
-    # Calculate separation distance
+    if M < 10:
-    r_num = np.sqrt((x2-x1)**2 + (y2-y1)**2 + (z2-z1)**2)
+        return None, "Insufficient data points for comparison"
-    # Mass parameters
+    # Extract XY coordinates for both black holes
-    M_total = M1 + M2
+    x1_ref = bh_data_ref['x1'][:M]
-    eta = M1 * M2 / M_total**2
+    y1_ref = bh_data_ref['y1'][:M]
    x2_ref = bh_data_ref['x2'][:M]
    y2_ref = bh_data_ref['y2'][:M]
-    # Use inspiral phase data (r > 2M)
+    x1_new = bh_data_target['x1'][:M]
-    inspiral_mask = r_num > 2.0
+    y1_new = bh_data_target['y1'][:M]
-    t_insp = time[inspiral_mask]
+    x2_new = bh_data_target['x2'][:M]
-    r_insp = r_num[inspiral_mask]
+    y2_new = bh_data_target['y2'][:M]
-    if len(t_insp) < 10:
+    # Calculate RMS for BH1
-        return None, None, "Insufficient data points"
+    delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
    r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
    r1_new = np.sqrt(x1_new**2 + y1_new**2)
    r1_max = np.maximum(r1_ref, r1_new)
-    # Post-Newtonian theory prediction
+    # Calculate RMS for BH2
-    r0 = r_insp[0]
+    delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
-    t0 = t_insp[0]
+    r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
-    t_coal_PN = (5.0/256.0) * (r0**4) / (eta * M_total**3)
+    r2_new = np.sqrt(x2_new**2 + y2_new**2)
    r2_max = np.maximum(r2_ref, r2_new)
-    # Calculate PN predicted separation distance
+    # Avoid division by zero for BH1
-    tau = 1.0 - (t_insp - t0) / t_coal_PN
+    valid_mask1 = r1_max > 1e-15
-    tau = np.maximum(tau, 1e-10)  # Avoid negative values
+    if np.sum(valid_mask1) < 10:
-    r_PN = r0 * np.power(tau, 0.25)
+        return None, "Insufficient valid data points for BH1"
-    # Use only valid data (t < 0.95 * t_coal)
+    terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
-    valid_mask = (t_insp - t0) < t_coal_PN * 0.95
+    rms_bh1 = np.sqrt(np.mean(terms1)) * 100
    r_num_valid = r_insp[valid_mask]
    r_PN_valid = r_PN[valid_mask]
-    # Calculate RMS error
+    # Avoid division by zero for BH2
-    residual = r_num_valid - r_PN_valid
+    valid_mask2 = r2_max > 1e-15
-    rms_abs = np.sqrt(np.mean(residual**2))
+    if np.sum(valid_mask2) < 10:
-    rms_rel = rms_abs / np.mean(r_num_valid) * 100
+        return None, "Insufficient valid data points for BH2"
-    return rms_abs, rms_rel, None
+    terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
    rms_bh2 = np.sqrt(np.mean(terms2)) * 100
    # Final RMS is the maximum of BH1 and BH2
    rms_final = max(rms_bh1, rms_bh2)
    return rms_final, None
 def analyze_constraint_violation(constraint_data, n_levels=9):
@@ -125,13 +157,13 @@ def analyze_constraint_violation(constraint_data, n_levels=9):
 def print_header():
    """Print report header"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
-    print(Color.BOLD + "   AMSS-NCKU GW150914 Simulation Accuracy Verification Report" + Color.RESET)
+    print(Color.BOLD + "   AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
-def print_rms_results(rms_abs, rms_rel, error, threshold=1.0):
+def print_rms_results(rms_rel, error, threshold=1.0):
    """Print RMS error results"""
-    print(f"\n{Color.BOLD}1. RMS Error Analysis (Black Hole Trajectory){Color.RESET}")
+    print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
    print("-" * 45)
    if error:
@@ -139,8 +171,7 @@ def print_rms_results(rms_abs, rms_rel, error, threshold=1.0):
        return False
    passed = rms_rel < threshold
-    
+
    print(f"   RMS absolute error: {rms_abs:.4e} M")
    print(f"   RMS relative error: {rms_rel:.4f}%")
    print(f"   Requirement:        < {threshold}%")
    print(f"   Status:             {get_status_text(passed)}")
@@ -190,20 +221,29 @@ def print_summary(rms_passed, constraint_passed):
 def main():
-    # Determine output directory
+    # Determine target (optimized) output directory
    if len(sys.argv) > 1:
-        output_dir = sys.argv[1]
+        target_dir = sys.argv[1]
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
-        output_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
+        target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
    # Determine reference (baseline) directory
    script_dir = os.path.dirname(os.path.abspath(__file__))
    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
    # Data file paths
-    bh_file = os.path.join(output_dir, "bssn_BH.dat")
+    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
-    constraint_file = os.path.join(output_dir, "bssn_constraint.dat")
+    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
    constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
    # Check if files exist
-    if not os.path.exists(bh_file):
+    if not os.path.exists(bh_file_ref):
-        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Black hole trajectory file not found: {bh_file}")
+        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
        sys.exit(1)
    if not os.path.exists(bh_file_target):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
        sys.exit(1)
    if not os.path.exists(constraint_file):
@@ -212,15 +252,17 @@ def main():
    # Print header
    print_header()
-    print(f"\n{Color.BOLD}Target Directory:{Color.RESET} {Color.BLUE}{output_dir}{Color.RESET}")
+    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
    # Load data
-    bh_data = load_bh_trajectory(bh_file)
+    bh_data_ref = load_bh_trajectory(bh_file_ref)
    bh_data_target = load_bh_trajectory(bh_file_target)
    constraint_data = load_constraint_data(constraint_file)
    # Calculate RMS error
-    rms_abs, rms_rel, error = calculate_rms_error(bh_data)
+    rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
-    rms_passed = print_rms_results(rms_abs, rms_rel, error)
+    rms_passed = print_rms_results(rms_rel, error)
    # Analyze constraint violation
    constraint_results = analyze_constraint_violation(constraint_data)
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
@@ -27,6 +27,7 @@ using namespace std;
 #endif
 #include "TwoPunctures.h"
 #include <mkl_cblas.h>
 TwoPunctures::TwoPunctures(double mp, double mm, double b,
                           double P_plusx, double P_plusy, double P_plusz,
@@ -891,25 +892,17 @@ double TwoPunctures::norm1(double *v, int n)
 /* -------------------------------------------------------------------------*/
 double TwoPunctures::norm2(double *v, int n)
 {
-  int i;
+  // Optimized with oneMKL BLAS DNRM2
-  double result = 0;
+  // Computes: sqrt(sum(v[i]^2))
-
+  return cblas_dnrm2(n, v, 1);
  for (i = 0; i < n; i++)
    result += v[i] * v[i];
  return sqrt(result);
 }
 /* -------------------------------------------------------------------------*/
 double TwoPunctures::scalarproduct(double *v, double *w, int n)
 {
-  int i;
+  // Optimized with oneMKL BLAS DDOT
-  double result = 0;
+  // Computes: sum(v[i] * w[i])
-
+  return cblas_ddot(n, v, 1, w, 1);
  for (i = 0; i < n; i++)
    result += v[i] * w[i];
  return result;
 }
 /* -------------------------------------------------------------------------*/
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -1259,7 +1259,7 @@ end subroutine d2dump
  end subroutine polin3
 !--------------------------------------------------------------------------------------
-! calculate L2norm  
+! calculate L2norm
  subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
                          f,f_out,gw)
@@ -1276,7 +1276,9 @@ end subroutine d2dump
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k
+  integer::i,j,k,n_elements
  real*8, dimension(:), allocatable :: f_flat
  real*8, external :: DDOT
  dX = X(2) - X(1)
  dY = Y(2) - Y(1)
@@ -1300,7 +1302,12 @@ if(dabs(X(1)-xmin) < dX) imin = 1
 if(dabs(Y(1)-ymin) < dY) jmin = 1
 if(dabs(Z(1)-zmin) < dZ) kmin = 1
-f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
+! Optimized with oneMKL BLAS DDOT for dot product
 n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
 allocate(f_flat(n_elements))
 f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
 f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
 deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
@@ -1325,7 +1332,9 @@ f_out = f_out*dX*dY*dZ
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k
+  integer::i,j,k,n_elements
  real*8, dimension(:), allocatable :: f_flat
  real*8, external :: DDOT
  real*8 :: PIo4
@@ -1388,7 +1397,12 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
-f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
+! Optimized with oneMKL BLAS DDOT for dot product
 n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
 allocate(f_flat(n_elements))
 f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
 f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
 deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
@@ -1416,6 +1430,8 @@ f_out = f_out*dX*dY*dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
  integer::i,j,k
  real*8, dimension(:), allocatable :: f_flat
  real*8, external :: DDOT
  real*8 :: PIo4
@@ -1478,11 +1494,12 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
-f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
+! Optimized with oneMKL BLAS DDOT for dot product
 f_out = f_out
 Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
 allocate(f_flat(Nout))
 f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
 f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
 deallocate(f_flat)
  return
@@ -1680,6 +1697,7 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
  real*8, dimension(ORDN,ORDN) :: tmp2
  real*8, dimension(ORDN) :: tmp1
  real*8, dimension(3) :: SoAh
  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  cxB = inds+1  
@@ -1715,20 +1733,21 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
  endif 
  ! Optimized with BLAS operations for better performance
  ! First dimension: z-direction weighted sum
  tmp2=0
  do m=1,ORDN
    tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
  enddo
  ! Second dimension: y-direction weighted sum
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
  enddo
-  f_int=0
+  ! Third dimension: x-direction weighted sum using BLAS DDOT
-  do m=1,ORDN
+  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
    f_int = f_int + coef(m)*tmp1(m)
  enddo
  return
@@ -1758,6 +1777,7 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
  real*8, dimension(ORDN,ORDN) :: ya
  real*8, dimension(ORDN) :: tmp1
  real*8, dimension(2) :: SoAh
  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  cxB = inds(1:2)+1  
@@ -1787,15 +1807,14 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
  endif 
  ! Optimized with BLAS operations
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
  enddo
-  f_int=0
+  ! Use BLAS DDOT for final weighted sum
-  do m=1,ORDN
+  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
    f_int = f_int + coef(m)*tmp1(m)
  enddo
  return
@@ -1826,6 +1845,7 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
  real*8, dimension(ORDN) :: ya
  real*8 :: SoAh
  integer,dimension(3) :: inds
  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  inds = indsi + 1
@@ -1886,10 +1906,8 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
          write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
  endif
-  f_int=0
+  ! Optimized with BLAS DDOT for weighted sum
-  do m=1,ORDN
+  f_int = DDOT(ORDN, coef, 1, ya, 1)
    f_int = f_int + coef(m)*ya(m)
  enddo
  return
@@ -2121,24 +2139,38 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
  end function fWigner_d_function
 !----------------------------------
 ! Optimized factorial function using lookup table for small N
 ! and log-gamma for large N to avoid overflow
  function ffact(N) result(gont)
  implicit none
  integer,intent(in) :: N
  real*8 :: gont
  integer :: i
  ! Lookup table for factorials 0! to 20! (precomputed)
  real*8, parameter, dimension(0:20) :: fact_table = [ &
    1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
    362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
    87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
    355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
    2432902008176640000.d0 ]
 ! sanity check
  if(N < 0)then
     write(*,*) "ffact: error input for factorial"
     gont = 1.d0
     return
  endif
-  gont = 1.d0
+  ! Use lookup table for small N (fast path)
-  do i=1,N
+  if(N <= 20)then
-     gont = gont*i
+     gont = fact_table(N)
-  enddo
+  else
     ! Use log-gamma function for large N: N! = exp(log_gamma(N+1))
     ! This avoids overflow and is computed efficiently
     gont = exp(log_gamma(dble(N+1)))
  endif
  return
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -6,20 +6,20 @@
 ## Intel oneAPI version with oneMKL (Optimized for performance)
 filein  = -I/usr/include/ -I${MKLROOT}/include
-## Use Intel OpenMP threading layer for better performance
+## Using sequential MKL (OpenMP disabled for better single-threaded performance)
 LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \
-          -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core \
+          -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
-          -liomp5 -lpthread -lm -ldl
+          -lpthread -lm -ldl
 ## Aggressive optimization flags:
 ## -O3: Maximum optimization
 ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
 ## -qopenmp: Enable OpenMP parallelization
 ## -fp-model fast=2: Aggressive floating-point optimizations
 ## -fma: Enable fused multiply-add instructions
-CXXAPPFLAGS  = -O3 -xHost -qopenmp -fp-model fast=2 -fma \
+## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
 CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma \
               -Dfortran3 -Dnewc -I${MKLROOT}/include
-f90appflags  = -O3 -xHost -qopenmp -fp-model fast=2 -fma \
+f90appflags  = -O3 -xHost -fp-model fast=2 -fma \
               -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx