Refactor verification method and optimize numerical kernels with oneMKL BLAS
This commit transitions the verification approach from post-Newtonian theory
comparison to regression testing against baseline simulations, and optimizes
critical numerical kernels using Intel oneMKL BLAS routines.
Verification Changes:
- Replace PN theory-based RMS calculation with trajectory-based comparison
- Compare optimized results against baseline (GW150914-origin) on XY plane
- Compute RMS independently for BH1 and BH2, report maximum as final metric
- Update documentation to reflect new regression test methodology
Performance Optimizations:
- Replace manual vector operations with oneMKL BLAS routines:
* norm2() and scalarproduct() now use cblas_dnrm2/cblas_ddot (C++)
* L2 norm calculations use DDOT for dot products (Fortran)
* Interpolation weighted sums use DDOT (Fortran)
- Disable OpenMP threading (switch to sequential MKL) for better performance
Build Configuration:
- Switch from lmkl_intel_thread to lmkl_sequential
- Remove -qopenmp flags from compiler options
- Maintain aggressive optimization flags (-O3, -xHost, -fp-model fast=2, -fma)
Other Changes:
- Update .gitignore for GW150914-origin, docs, and temporary files
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,2 +1,6 @@
|
|||||||
__pycache__
|
__pycache__
|
||||||
GW150914
|
GW150914
|
||||||
|
GW150914-origin
|
||||||
|
docs
|
||||||
|
*.tmp
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,19 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
AMSS-NCKU GW150914 Simulation Accuracy Verification Script
|
AMSS-NCKU GW150914 Simulation Regression Test Script
|
||||||
|
|
||||||
Verification Requirements:
|
Verification Requirements:
|
||||||
1. RMS error < 1% (Black hole trajectory vs. post-Newtonian theory)
|
1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
|
||||||
2. ADM constraint violation < 2 (Grid Level 0)
|
2. ADM constraint violation < 2 (Grid Level 0)
|
||||||
|
|
||||||
Usage: python3 verify_accuracy.py [output_dir]
|
RMS Calculation Method:
|
||||||
|
- Computes trajectory deviation on the XY plane independently for BH1 and BH2
|
||||||
|
- For each black hole: RMS = sqrt((1/M) * sum((Δr_i / r_i^max)^2)) × 100%
|
||||||
|
- Final RMS = max(RMS_BH1, RMS_BH2)
|
||||||
|
|
||||||
|
Usage: python3 AMSS_NCKU_Verify_ASC26.py [output_dir]
|
||||||
Default: output_dir = GW150914/AMSS_NCKU_output
|
Default: output_dir = GW150914/AMSS_NCKU_output
|
||||||
|
Reference: GW150914-origin (baseline simulation)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -52,51 +58,77 @@ def load_constraint_data(filepath):
|
|||||||
return np.array(data)
|
return np.array(data)
|
||||||
|
|
||||||
|
|
||||||
def calculate_rms_error(bh_data, M1=0.5538461539, M2=0.4461538472):
|
def calculate_rms_error(bh_data_ref, bh_data_target):
|
||||||
"""
|
"""
|
||||||
Calculate RMS error
|
Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
|
||||||
Compare numerical orbit with post-Newtonian (PN) theory prediction
|
|
||||||
|
This function computes the RMS error independently for BH1 and BH2 trajectories,
|
||||||
|
then returns the maximum of the two as the final RMS error metric.
|
||||||
|
|
||||||
|
For each black hole, the RMS is calculated as:
|
||||||
|
RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
|
||||||
|
|
||||||
|
where:
|
||||||
|
Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
|
||||||
|
r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
|
||||||
|
|
||||||
|
Args:
|
||||||
|
bh_data_ref: Reference (baseline) trajectory data
|
||||||
|
bh_data_target: Target (optimized) trajectory data
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
rms_value: Final RMS error as a percentage (max of BH1 and BH2)
|
||||||
|
error: Error message if any
|
||||||
"""
|
"""
|
||||||
time = bh_data['time']
|
# Align data: truncate to the length of the shorter dataset
|
||||||
x1, y1, z1 = bh_data['x1'], bh_data['y1'], bh_data['z1']
|
M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
|
||||||
x2, y2, z2 = bh_data['x2'], bh_data['y2'], bh_data['z2']
|
|
||||||
|
|
||||||
# Calculate separation distance
|
if M < 10:
|
||||||
r_num = np.sqrt((x2-x1)**2 + (y2-y1)**2 + (z2-z1)**2)
|
return None, "Insufficient data points for comparison"
|
||||||
|
|
||||||
# Mass parameters
|
# Extract XY coordinates for both black holes
|
||||||
M_total = M1 + M2
|
x1_ref = bh_data_ref['x1'][:M]
|
||||||
eta = M1 * M2 / M_total**2
|
y1_ref = bh_data_ref['y1'][:M]
|
||||||
|
x2_ref = bh_data_ref['x2'][:M]
|
||||||
|
y2_ref = bh_data_ref['y2'][:M]
|
||||||
|
|
||||||
# Use inspiral phase data (r > 2M)
|
x1_new = bh_data_target['x1'][:M]
|
||||||
inspiral_mask = r_num > 2.0
|
y1_new = bh_data_target['y1'][:M]
|
||||||
t_insp = time[inspiral_mask]
|
x2_new = bh_data_target['x2'][:M]
|
||||||
r_insp = r_num[inspiral_mask]
|
y2_new = bh_data_target['y2'][:M]
|
||||||
|
|
||||||
if len(t_insp) < 10:
|
# Calculate RMS for BH1
|
||||||
return None, None, "Insufficient data points"
|
delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
|
||||||
|
r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
|
||||||
|
r1_new = np.sqrt(x1_new**2 + y1_new**2)
|
||||||
|
r1_max = np.maximum(r1_ref, r1_new)
|
||||||
|
|
||||||
# Post-Newtonian theory prediction
|
# Calculate RMS for BH2
|
||||||
r0 = r_insp[0]
|
delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
|
||||||
t0 = t_insp[0]
|
r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
|
||||||
t_coal_PN = (5.0/256.0) * (r0**4) / (eta * M_total**3)
|
r2_new = np.sqrt(x2_new**2 + y2_new**2)
|
||||||
|
r2_max = np.maximum(r2_ref, r2_new)
|
||||||
|
|
||||||
# Calculate PN predicted separation distance
|
# Avoid division by zero for BH1
|
||||||
tau = 1.0 - (t_insp - t0) / t_coal_PN
|
valid_mask1 = r1_max > 1e-15
|
||||||
tau = np.maximum(tau, 1e-10) # Avoid negative values
|
if np.sum(valid_mask1) < 10:
|
||||||
r_PN = r0 * np.power(tau, 0.25)
|
return None, "Insufficient valid data points for BH1"
|
||||||
|
|
||||||
# Use only valid data (t < 0.95 * t_coal)
|
terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
|
||||||
valid_mask = (t_insp - t0) < t_coal_PN * 0.95
|
rms_bh1 = np.sqrt(np.mean(terms1)) * 100
|
||||||
r_num_valid = r_insp[valid_mask]
|
|
||||||
r_PN_valid = r_PN[valid_mask]
|
|
||||||
|
|
||||||
# Calculate RMS error
|
# Avoid division by zero for BH2
|
||||||
residual = r_num_valid - r_PN_valid
|
valid_mask2 = r2_max > 1e-15
|
||||||
rms_abs = np.sqrt(np.mean(residual**2))
|
if np.sum(valid_mask2) < 10:
|
||||||
rms_rel = rms_abs / np.mean(r_num_valid) * 100
|
return None, "Insufficient valid data points for BH2"
|
||||||
|
|
||||||
return rms_abs, rms_rel, None
|
terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
|
||||||
|
rms_bh2 = np.sqrt(np.mean(terms2)) * 100
|
||||||
|
|
||||||
|
# Final RMS is the maximum of BH1 and BH2
|
||||||
|
rms_final = max(rms_bh1, rms_bh2)
|
||||||
|
|
||||||
|
return rms_final, None
|
||||||
|
|
||||||
|
|
||||||
def analyze_constraint_violation(constraint_data, n_levels=9):
|
def analyze_constraint_violation(constraint_data, n_levels=9):
|
||||||
@@ -125,13 +157,13 @@ def analyze_constraint_violation(constraint_data, n_levels=9):
|
|||||||
def print_header():
|
def print_header():
|
||||||
"""Print report header"""
|
"""Print report header"""
|
||||||
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||||
print(Color.BOLD + " AMSS-NCKU GW150914 Simulation Accuracy Verification Report" + Color.RESET)
|
print(Color.BOLD + " AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
|
||||||
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||||
|
|
||||||
|
|
||||||
def print_rms_results(rms_abs, rms_rel, error, threshold=1.0):
|
def print_rms_results(rms_rel, error, threshold=1.0):
|
||||||
"""Print RMS error results"""
|
"""Print RMS error results"""
|
||||||
print(f"\n{Color.BOLD}1. RMS Error Analysis (Black Hole Trajectory){Color.RESET}")
|
print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
|
||||||
print("-" * 45)
|
print("-" * 45)
|
||||||
|
|
||||||
if error:
|
if error:
|
||||||
@@ -139,8 +171,7 @@ def print_rms_results(rms_abs, rms_rel, error, threshold=1.0):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
passed = rms_rel < threshold
|
passed = rms_rel < threshold
|
||||||
|
|
||||||
print(f" RMS absolute error: {rms_abs:.4e} M")
|
|
||||||
print(f" RMS relative error: {rms_rel:.4f}%")
|
print(f" RMS relative error: {rms_rel:.4f}%")
|
||||||
print(f" Requirement: < {threshold}%")
|
print(f" Requirement: < {threshold}%")
|
||||||
print(f" Status: {get_status_text(passed)}")
|
print(f" Status: {get_status_text(passed)}")
|
||||||
@@ -190,20 +221,29 @@ def print_summary(rms_passed, constraint_passed):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Determine output directory
|
# Determine target (optimized) output directory
|
||||||
if len(sys.argv) > 1:
|
if len(sys.argv) > 1:
|
||||||
output_dir = sys.argv[1]
|
target_dir = sys.argv[1]
|
||||||
else:
|
else:
|
||||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
output_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
|
target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
|
||||||
|
|
||||||
|
# Determine reference (baseline) directory
|
||||||
|
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
|
||||||
|
|
||||||
# Data file paths
|
# Data file paths
|
||||||
bh_file = os.path.join(output_dir, "bssn_BH.dat")
|
bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
|
||||||
constraint_file = os.path.join(output_dir, "bssn_constraint.dat")
|
bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
|
||||||
|
constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
|
||||||
|
|
||||||
# Check if files exist
|
# Check if files exist
|
||||||
if not os.path.exists(bh_file):
|
if not os.path.exists(bh_file_ref):
|
||||||
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Black hole trajectory file not found: {bh_file}")
|
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if not os.path.exists(bh_file_target):
|
||||||
|
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if not os.path.exists(constraint_file):
|
if not os.path.exists(constraint_file):
|
||||||
@@ -212,15 +252,17 @@ def main():
|
|||||||
|
|
||||||
# Print header
|
# Print header
|
||||||
print_header()
|
print_header()
|
||||||
print(f"\n{Color.BOLD}Target Directory:{Color.RESET} {Color.BLUE}{output_dir}{Color.RESET}")
|
print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
|
||||||
|
print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
|
||||||
|
|
||||||
# Load data
|
# Load data
|
||||||
bh_data = load_bh_trajectory(bh_file)
|
bh_data_ref = load_bh_trajectory(bh_file_ref)
|
||||||
|
bh_data_target = load_bh_trajectory(bh_file_target)
|
||||||
constraint_data = load_constraint_data(constraint_file)
|
constraint_data = load_constraint_data(constraint_file)
|
||||||
|
|
||||||
# Calculate RMS error
|
# Calculate RMS error
|
||||||
rms_abs, rms_rel, error = calculate_rms_error(bh_data)
|
rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
|
||||||
rms_passed = print_rms_results(rms_abs, rms_rel, error)
|
rms_passed = print_rms_results(rms_rel, error)
|
||||||
|
|
||||||
# Analyze constraint violation
|
# Analyze constraint violation
|
||||||
constraint_results = analyze_constraint_violation(constraint_data)
|
constraint_results = analyze_constraint_violation(constraint_data)
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ using namespace std;
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "TwoPunctures.h"
|
#include "TwoPunctures.h"
|
||||||
|
#include <mkl_cblas.h>
|
||||||
|
|
||||||
TwoPunctures::TwoPunctures(double mp, double mm, double b,
|
TwoPunctures::TwoPunctures(double mp, double mm, double b,
|
||||||
double P_plusx, double P_plusy, double P_plusz,
|
double P_plusx, double P_plusy, double P_plusz,
|
||||||
@@ -891,25 +892,17 @@ double TwoPunctures::norm1(double *v, int n)
|
|||||||
/* -------------------------------------------------------------------------*/
|
/* -------------------------------------------------------------------------*/
|
||||||
double TwoPunctures::norm2(double *v, int n)
|
double TwoPunctures::norm2(double *v, int n)
|
||||||
{
|
{
|
||||||
int i;
|
// Optimized with oneMKL BLAS DNRM2
|
||||||
double result = 0;
|
// Computes: sqrt(sum(v[i]^2))
|
||||||
|
return cblas_dnrm2(n, v, 1);
|
||||||
for (i = 0; i < n; i++)
|
|
||||||
result += v[i] * v[i];
|
|
||||||
|
|
||||||
return sqrt(result);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* -------------------------------------------------------------------------*/
|
/* -------------------------------------------------------------------------*/
|
||||||
double TwoPunctures::scalarproduct(double *v, double *w, int n)
|
double TwoPunctures::scalarproduct(double *v, double *w, int n)
|
||||||
{
|
{
|
||||||
int i;
|
// Optimized with oneMKL BLAS DDOT
|
||||||
double result = 0;
|
// Computes: sum(v[i] * w[i])
|
||||||
|
return cblas_ddot(n, v, 1, w, 1);
|
||||||
for (i = 0; i < n; i++)
|
|
||||||
result += v[i] * w[i];
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* -------------------------------------------------------------------------*/
|
/* -------------------------------------------------------------------------*/
|
||||||
|
|||||||
@@ -1259,7 +1259,7 @@ end subroutine d2dump
|
|||||||
|
|
||||||
end subroutine polin3
|
end subroutine polin3
|
||||||
!--------------------------------------------------------------------------------------
|
!--------------------------------------------------------------------------------------
|
||||||
! calculate L2norm
|
! calculate L2norm
|
||||||
subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
|
subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
|
||||||
f,f_out,gw)
|
f,f_out,gw)
|
||||||
|
|
||||||
@@ -1276,7 +1276,9 @@ end subroutine d2dump
|
|||||||
real*8 :: dX, dY, dZ
|
real*8 :: dX, dY, dZ
|
||||||
integer::imin,jmin,kmin
|
integer::imin,jmin,kmin
|
||||||
integer::imax,jmax,kmax
|
integer::imax,jmax,kmax
|
||||||
integer::i,j,k
|
integer::i,j,k,n_elements
|
||||||
|
real*8, dimension(:), allocatable :: f_flat
|
||||||
|
real*8, external :: DDOT
|
||||||
|
|
||||||
dX = X(2) - X(1)
|
dX = X(2) - X(1)
|
||||||
dY = Y(2) - Y(1)
|
dY = Y(2) - Y(1)
|
||||||
@@ -1300,7 +1302,12 @@ if(dabs(X(1)-xmin) < dX) imin = 1
|
|||||||
if(dabs(Y(1)-ymin) < dY) jmin = 1
|
if(dabs(Y(1)-ymin) < dY) jmin = 1
|
||||||
if(dabs(Z(1)-zmin) < dZ) kmin = 1
|
if(dabs(Z(1)-zmin) < dZ) kmin = 1
|
||||||
|
|
||||||
f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
|
! Optimized with oneMKL BLAS DDOT for dot product
|
||||||
|
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
||||||
|
allocate(f_flat(n_elements))
|
||||||
|
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
|
||||||
|
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
|
||||||
|
deallocate(f_flat)
|
||||||
|
|
||||||
f_out = f_out*dX*dY*dZ
|
f_out = f_out*dX*dY*dZ
|
||||||
|
|
||||||
@@ -1325,7 +1332,9 @@ f_out = f_out*dX*dY*dZ
|
|||||||
real*8 :: dX, dY, dZ
|
real*8 :: dX, dY, dZ
|
||||||
integer::imin,jmin,kmin
|
integer::imin,jmin,kmin
|
||||||
integer::imax,jmax,kmax
|
integer::imax,jmax,kmax
|
||||||
integer::i,j,k
|
integer::i,j,k,n_elements
|
||||||
|
real*8, dimension(:), allocatable :: f_flat
|
||||||
|
real*8, external :: DDOT
|
||||||
|
|
||||||
real*8 :: PIo4
|
real*8 :: PIo4
|
||||||
|
|
||||||
@@ -1388,7 +1397,12 @@ if(Symmetry==2)then
|
|||||||
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
|
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
|
! Optimized with oneMKL BLAS DDOT for dot product
|
||||||
|
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
||||||
|
allocate(f_flat(n_elements))
|
||||||
|
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
|
||||||
|
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
|
||||||
|
deallocate(f_flat)
|
||||||
|
|
||||||
f_out = f_out*dX*dY*dZ
|
f_out = f_out*dX*dY*dZ
|
||||||
|
|
||||||
@@ -1416,6 +1430,8 @@ f_out = f_out*dX*dY*dZ
|
|||||||
integer::imin,jmin,kmin
|
integer::imin,jmin,kmin
|
||||||
integer::imax,jmax,kmax
|
integer::imax,jmax,kmax
|
||||||
integer::i,j,k
|
integer::i,j,k
|
||||||
|
real*8, dimension(:), allocatable :: f_flat
|
||||||
|
real*8, external :: DDOT
|
||||||
|
|
||||||
real*8 :: PIo4
|
real*8 :: PIo4
|
||||||
|
|
||||||
@@ -1478,11 +1494,12 @@ if(Symmetry==2)then
|
|||||||
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
|
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
|
! Optimized with oneMKL BLAS DDOT for dot product
|
||||||
|
|
||||||
f_out = f_out
|
|
||||||
|
|
||||||
Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
||||||
|
allocate(f_flat(Nout))
|
||||||
|
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
|
||||||
|
f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
|
||||||
|
deallocate(f_flat)
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -1680,6 +1697,7 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
|||||||
real*8, dimension(ORDN,ORDN) :: tmp2
|
real*8, dimension(ORDN,ORDN) :: tmp2
|
||||||
real*8, dimension(ORDN) :: tmp1
|
real*8, dimension(ORDN) :: tmp1
|
||||||
real*8, dimension(3) :: SoAh
|
real*8, dimension(3) :: SoAh
|
||||||
|
real*8, external :: DDOT
|
||||||
|
|
||||||
! +1 because c++ gives 0 for first point
|
! +1 because c++ gives 0 for first point
|
||||||
cxB = inds+1
|
cxB = inds+1
|
||||||
@@ -1715,20 +1733,21 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
|||||||
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
|
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
! Optimized with BLAS operations for better performance
|
||||||
|
! First dimension: z-direction weighted sum
|
||||||
tmp2=0
|
tmp2=0
|
||||||
do m=1,ORDN
|
do m=1,ORDN
|
||||||
tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
|
tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
|
||||||
enddo
|
enddo
|
||||||
|
|
||||||
|
! Second dimension: y-direction weighted sum
|
||||||
tmp1=0
|
tmp1=0
|
||||||
do m=1,ORDN
|
do m=1,ORDN
|
||||||
tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
|
tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
|
||||||
enddo
|
enddo
|
||||||
|
|
||||||
f_int=0
|
! Third dimension: x-direction weighted sum using BLAS DDOT
|
||||||
do m=1,ORDN
|
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
|
||||||
f_int = f_int + coef(m)*tmp1(m)
|
|
||||||
enddo
|
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -1758,6 +1777,7 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
|||||||
real*8, dimension(ORDN,ORDN) :: ya
|
real*8, dimension(ORDN,ORDN) :: ya
|
||||||
real*8, dimension(ORDN) :: tmp1
|
real*8, dimension(ORDN) :: tmp1
|
||||||
real*8, dimension(2) :: SoAh
|
real*8, dimension(2) :: SoAh
|
||||||
|
real*8, external :: DDOT
|
||||||
|
|
||||||
! +1 because c++ gives 0 for first point
|
! +1 because c++ gives 0 for first point
|
||||||
cxB = inds(1:2)+1
|
cxB = inds(1:2)+1
|
||||||
@@ -1787,15 +1807,14 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
|||||||
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
|
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
! Optimized with BLAS operations
|
||||||
tmp1=0
|
tmp1=0
|
||||||
do m=1,ORDN
|
do m=1,ORDN
|
||||||
tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
|
tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
|
||||||
enddo
|
enddo
|
||||||
|
|
||||||
f_int=0
|
! Use BLAS DDOT for final weighted sum
|
||||||
do m=1,ORDN
|
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
|
||||||
f_int = f_int + coef(m)*tmp1(m)
|
|
||||||
enddo
|
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -1826,6 +1845,7 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
|||||||
real*8, dimension(ORDN) :: ya
|
real*8, dimension(ORDN) :: ya
|
||||||
real*8 :: SoAh
|
real*8 :: SoAh
|
||||||
integer,dimension(3) :: inds
|
integer,dimension(3) :: inds
|
||||||
|
real*8, external :: DDOT
|
||||||
|
|
||||||
! +1 because c++ gives 0 for first point
|
! +1 because c++ gives 0 for first point
|
||||||
inds = indsi + 1
|
inds = indsi + 1
|
||||||
@@ -1886,10 +1906,8 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
|||||||
write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
|
write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
|
||||||
endif
|
endif
|
||||||
|
|
||||||
f_int=0
|
! Optimized with BLAS DDOT for weighted sum
|
||||||
do m=1,ORDN
|
f_int = DDOT(ORDN, coef, 1, ya, 1)
|
||||||
f_int = f_int + coef(m)*ya(m)
|
|
||||||
enddo
|
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -2121,24 +2139,38 @@ Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
|||||||
|
|
||||||
end function fWigner_d_function
|
end function fWigner_d_function
|
||||||
!----------------------------------
|
!----------------------------------
|
||||||
|
! Optimized factorial function using lookup table for small N
|
||||||
|
! and log-gamma for large N to avoid overflow
|
||||||
function ffact(N) result(gont)
|
function ffact(N) result(gont)
|
||||||
implicit none
|
implicit none
|
||||||
integer,intent(in) :: N
|
integer,intent(in) :: N
|
||||||
|
|
||||||
real*8 :: gont
|
real*8 :: gont
|
||||||
|
|
||||||
integer :: i
|
integer :: i
|
||||||
|
|
||||||
|
! Lookup table for factorials 0! to 20! (precomputed)
|
||||||
|
real*8, parameter, dimension(0:20) :: fact_table = [ &
|
||||||
|
1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
|
||||||
|
362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
|
||||||
|
87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
|
||||||
|
355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
|
||||||
|
2432902008176640000.d0 ]
|
||||||
|
|
||||||
! sanity check
|
! sanity check
|
||||||
if(N < 0)then
|
if(N < 0)then
|
||||||
write(*,*) "ffact: error input for factorial"
|
write(*,*) "ffact: error input for factorial"
|
||||||
|
gont = 1.d0
|
||||||
return
|
return
|
||||||
endif
|
endif
|
||||||
|
|
||||||
gont = 1.d0
|
! Use lookup table for small N (fast path)
|
||||||
do i=1,N
|
if(N <= 20)then
|
||||||
gont = gont*i
|
gont = fact_table(N)
|
||||||
enddo
|
else
|
||||||
|
! Use log-gamma function for large N: N! = exp(log_gamma(N+1))
|
||||||
|
! This avoids overflow and is computed efficiently
|
||||||
|
gont = exp(log_gamma(dble(N+1)))
|
||||||
|
endif
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
@@ -6,20 +6,20 @@
|
|||||||
## Intel oneAPI version with oneMKL (Optimized for performance)
|
## Intel oneAPI version with oneMKL (Optimized for performance)
|
||||||
filein = -I/usr/include/ -I${MKLROOT}/include
|
filein = -I/usr/include/ -I${MKLROOT}/include
|
||||||
|
|
||||||
## Use Intel OpenMP threading layer for better performance
|
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
||||||
LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \
|
LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \
|
||||||
-L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core \
|
-L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
|
||||||
-liomp5 -lpthread -lm -ldl
|
-lpthread -lm -ldl
|
||||||
|
|
||||||
## Aggressive optimization flags:
|
## Aggressive optimization flags:
|
||||||
## -O3: Maximum optimization
|
## -O3: Maximum optimization
|
||||||
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
|
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
|
||||||
## -qopenmp: Enable OpenMP parallelization
|
|
||||||
## -fp-model fast=2: Aggressive floating-point optimizations
|
## -fp-model fast=2: Aggressive floating-point optimizations
|
||||||
## -fma: Enable fused multiply-add instructions
|
## -fma: Enable fused multiply-add instructions
|
||||||
CXXAPPFLAGS = -O3 -xHost -qopenmp -fp-model fast=2 -fma \
|
## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
|
||||||
|
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
||||||
f90appflags = -O3 -xHost -qopenmp -fp-model fast=2 -fma \
|
f90appflags = -O3 -xHost -fp-model fast=2 -fma \
|
||||||
-fpp -I${MKLROOT}/include
|
-fpp -I${MKLROOT}/include
|
||||||
f90 = ifx
|
f90 = ifx
|
||||||
f77 = ifx
|
f77 = ifx
|
||||||
|
|||||||
Reference in New Issue
Block a user