Compare commits

..

5 Commits

23 changed files with 2059 additions and 1362 deletions

7
.gitignore vendored
View File

@@ -1,6 +1,5 @@
__pycache__ __pycache__
GW150914 GW150914
GW150914-origin GW150914*
docs .codex
*.tmp docs/

View File

@@ -9,9 +9,19 @@
################################################################## ##################################################################
################################################################## ##################################################################
## Print program introduction ## Guard against re-execution by multiprocessing child processes.
## Without this, using 'spawn' or 'forkserver' context would cause every
## worker to re-run the entire script.
if __name__ != '__main__':
import sys as _sys
_sys.exit(0)
##################################################################
## Print program introduction
import print_information import print_information
@@ -422,31 +432,36 @@ print( " Plotting the txt and binary results data from the AMSS-NCKU simulation
print( ) print( )
import plot_xiaoqu import plot_xiaoqu
import plot_GW_strain_amplitude_xiaoqu import plot_GW_strain_amplitude_xiaoqu
from parallel_plot_helper import run_plot_tasks_parallel
## Plot black hole trajectory
plot_xiaoqu.generate_puncture_orbit_plot( binary_results_directory, figure_directory ) plot_tasks = []
plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
## Plot black hole trajectory
## Plot black hole separation vs. time plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot, (binary_results_directory, figure_directory) ) )
plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory ) plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )
## Plot gravitational waveforms (psi4 and strain amplitude) ## Plot black hole separation vs. time
for i in range(input_data.Detector_Number): plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )
plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i ) ## Plot gravitational waveforms (psi4 and strain amplitude)
for i in range(input_data.Detector_Number):
## Plot ADM mass evolution plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
for i in range(input_data.Detector_Number): plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )
plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
## Plot ADM mass evolution
## Plot Hamiltonian constraint violation over time for i in range(input_data.Detector_Number):
for i in range(input_data.grid_level): plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )
plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
## Plot Hamiltonian constraint violation over time
## Plot stored binary data for i in range(input_data.grid_level):
plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory ) plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )
run_plot_tasks_parallel(plot_tasks)
## Plot stored binary data
plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
print( ) print( )
print( f" This Program Cost = {elapsed_time} Seconds " ) print( f" This Program Cost = {elapsed_time} Seconds " )

View File

@@ -1,279 +0,0 @@
#!/usr/bin/env python3
"""
AMSS-NCKU GW150914 Simulation Regression Test Script
Verification Requirements:
1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
2. ADM constraint violation < 2 (Grid Level 0)
RMS Calculation Method:
- Computes trajectory deviation on the XY plane independently for BH1 and BH2
- For each black hole: RMS = sqrt((1/M) * sum((Δr_i / r_i^max)^2)) × 100%
- Final RMS = max(RMS_BH1, RMS_BH2)
Usage: python3 AMSS_NCKU_Verify_ASC26.py [output_dir]
Default: output_dir = GW150914/AMSS_NCKU_output
Reference: GW150914-origin (baseline simulation)
"""
import numpy as np
import sys
import os
# ANSI Color Codes
class Color:
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
BOLD = '\033[1m'
RESET = '\033[0m'
def get_status_text(passed):
if passed:
return f"{Color.GREEN}{Color.BOLD}PASS{Color.RESET}"
else:
return f"{Color.RED}{Color.BOLD}FAIL{Color.RESET}"
def load_bh_trajectory(filepath):
"""Load black hole trajectory data"""
data = np.loadtxt(filepath)
return {
'time': data[:, 0],
'x1': data[:, 1], 'y1': data[:, 2], 'z1': data[:, 3],
'x2': data[:, 4], 'y2': data[:, 5], 'z2': data[:, 6]
}
def load_constraint_data(filepath):
"""Load constraint violation data"""
data = []
with open(filepath, 'r') as f:
for line in f:
if line.startswith('#'):
continue
parts = line.split()
if len(parts) >= 8:
data.append([float(x) for x in parts[:8]])
return np.array(data)
def calculate_rms_error(bh_data_ref, bh_data_target):
"""
Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
This function computes the RMS error independently for BH1 and BH2 trajectories,
then returns the maximum of the two as the final RMS error metric.
For each black hole, the RMS is calculated as:
RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
where:
Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
Args:
bh_data_ref: Reference (baseline) trajectory data
bh_data_target: Target (optimized) trajectory data
Returns:
rms_value: Final RMS error as a percentage (max of BH1 and BH2)
error: Error message if any
"""
# Align data: truncate to the length of the shorter dataset
M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
if M < 10:
return None, "Insufficient data points for comparison"
# Extract XY coordinates for both black holes
x1_ref = bh_data_ref['x1'][:M]
y1_ref = bh_data_ref['y1'][:M]
x2_ref = bh_data_ref['x2'][:M]
y2_ref = bh_data_ref['y2'][:M]
x1_new = bh_data_target['x1'][:M]
y1_new = bh_data_target['y1'][:M]
x2_new = bh_data_target['x2'][:M]
y2_new = bh_data_target['y2'][:M]
# Calculate RMS for BH1
delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
r1_new = np.sqrt(x1_new**2 + y1_new**2)
r1_max = np.maximum(r1_ref, r1_new)
# Calculate RMS for BH2
delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
r2_new = np.sqrt(x2_new**2 + y2_new**2)
r2_max = np.maximum(r2_ref, r2_new)
# Avoid division by zero for BH1
valid_mask1 = r1_max > 1e-15
if np.sum(valid_mask1) < 10:
return None, "Insufficient valid data points for BH1"
terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
rms_bh1 = np.sqrt(np.mean(terms1)) * 100
# Avoid division by zero for BH2
valid_mask2 = r2_max > 1e-15
if np.sum(valid_mask2) < 10:
return None, "Insufficient valid data points for BH2"
terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
rms_bh2 = np.sqrt(np.mean(terms2)) * 100
# Final RMS is the maximum of BH1 and BH2
rms_final = max(rms_bh1, rms_bh2)
return rms_final, None
def analyze_constraint_violation(constraint_data, n_levels=9):
"""
Analyze ADM constraint violation
Return maximum constraint violation for Grid Level 0
"""
# Extract Grid Level 0 data (first entry for each time step)
level0_data = constraint_data[::n_levels]
# Calculate maximum absolute value for each constraint
results = {
'Ham': np.max(np.abs(level0_data[:, 1])),
'Px': np.max(np.abs(level0_data[:, 2])),
'Py': np.max(np.abs(level0_data[:, 3])),
'Pz': np.max(np.abs(level0_data[:, 4])),
'Gx': np.max(np.abs(level0_data[:, 5])),
'Gy': np.max(np.abs(level0_data[:, 6])),
'Gz': np.max(np.abs(level0_data[:, 7]))
}
results['max_violation'] = max(results.values())
return results
def print_header():
"""Print report header"""
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
print(Color.BOLD + " AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
def print_rms_results(rms_rel, error, threshold=1.0):
"""Print RMS error results"""
print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
print("-" * 45)
if error:
print(f" {Color.RED}Error: {error}{Color.RESET}")
return False
passed = rms_rel < threshold
print(f" RMS relative error: {rms_rel:.4f}%")
print(f" Requirement: < {threshold}%")
print(f" Status: {get_status_text(passed)}")
return passed
def print_constraint_results(results, threshold=2.0):
"""Print constraint violation results"""
print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
print("-" * 45)
names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
for i, name in enumerate(names):
print(f" Max |{name:3}|: {results[name]:.6f}", end=" ")
if (i + 1) % 2 == 0: print()
if len(names) % 2 != 0: print()
passed = results['max_violation'] < threshold
print(f"\n Maximum violation: {results['max_violation']:.6f}")
print(f" Requirement: < {threshold}")
print(f" Status: {get_status_text(passed)}")
return passed
def print_summary(rms_passed, constraint_passed):
"""Print summary"""
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
print(Color.BOLD + "Verification Summary" + Color.RESET)
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
all_passed = rms_passed and constraint_passed
res_rms = get_status_text(rms_passed)
res_con = get_status_text(constraint_passed)
print(f" [1] RMS trajectory check: {res_rms}")
print(f" [2] ADM constraint check: {res_con}")
final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
print(f"\n Overall result: {final_status}")
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET + "\n")
return all_passed
def main():
# Determine target (optimized) output directory
if len(sys.argv) > 1:
target_dir = sys.argv[1]
else:
script_dir = os.path.dirname(os.path.abspath(__file__))
target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
# Determine reference (baseline) directory
script_dir = os.path.dirname(os.path.abspath(__file__))
reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
# Data file paths
bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
# Check if files exist
if not os.path.exists(bh_file_ref):
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
sys.exit(1)
if not os.path.exists(bh_file_target):
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
sys.exit(1)
if not os.path.exists(constraint_file):
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
sys.exit(1)
# Print header
print_header()
print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
# Load data
bh_data_ref = load_bh_trajectory(bh_file_ref)
bh_data_target = load_bh_trajectory(bh_file_target)
constraint_data = load_constraint_data(constraint_file)
# Calculate RMS error
rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
rms_passed = print_rms_results(rms_rel, error)
# Analyze constraint violation
constraint_results = analyze_constraint_violation(constraint_data)
constraint_passed = print_constraint_results(constraint_results)
# Print summary
all_passed = print_summary(rms_passed, constraint_passed)
# Return exit code
sys.exit(0 if all_passed else 1)
if __name__ == "__main__":
main()

View File

@@ -37,51 +37,57 @@ close(77)
end program checkFFT end program checkFFT
#endif #endif
!-------------
! Optimized FFT using Intel oneMKL DFTI
! Mathematical equivalence: Standard DFT definition
! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
!------------- !-------------
SUBROUTINE four1(dataa,nn,isign) SUBROUTINE four1(dataa,nn,isign)
use MKL_DFTI
implicit none implicit none
INTEGER, intent(in) :: isign, nn INTEGER::isign,nn
DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa double precision,dimension(2*nn)::dataa
INTEGER::i,istep,j,m,mmax,n
type(DFTI_DESCRIPTOR), pointer :: desc double precision::tempi,tempr
integer :: status DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
n=2*nn
! Create DFTI descriptor for 1D complex-to-complex transform j=1
status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn) do i=1,n,2
if (status /= 0) return if(j.gt.i)then
tempr=dataa(j)
! Set input/output storage as interleaved complex (default) tempi=dataa(j+1)
status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE) dataa(j)=dataa(i)
if (status /= 0) then dataa(j+1)=dataa(i+1)
status = DftiFreeDescriptor(desc) dataa(i)=tempr
return dataa(i+1)=tempi
endif
m=nn
1 if ((m.ge.2).and.(j.gt.m)) then
j=j-m
m=m/2
goto 1
endif
j=j+m
enddo
mmax=2
2 if (n.gt.mmax) then
istep=2*mmax
theta=6.28318530717959d0/(isign*mmax)
wpr=-2.d0*sin(0.5d0*theta)**2
wpi=sin(theta)
wr=1.d0
wi=0.d0
do m=1,mmax,2
do i=m,n,istep
j=i+mmax
tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
dataa(j)=dataa(i)-tempr
dataa(j+1)=dataa(i+1)-tempi
dataa(i)=dataa(i)+tempr
dataa(i+1)=dataa(i+1)+tempi
enddo
wtemp=wr
wr=wr*wpr-wi*wpi+wr
wi=wi*wpr+wtemp*wpi+wi
enddo
mmax=istep
goto 2
endif endif
! Commit the descriptor
status = DftiCommitDescriptor(desc)
if (status /= 0) then
status = DftiFreeDescriptor(desc)
return
endif
! Execute FFT based on direction
if (isign == 1) then
! Forward FFT: exp(-2*pi*i*k*n/N)
status = DftiComputeForward(desc, dataa)
else
! Backward FFT: exp(+2*pi*i*k*n/N)
status = DftiComputeBackward(desc, dataa)
endif
! Free descriptor
status = DftiFreeDescriptor(desc)
return return
END SUBROUTINE four1 END SUBROUTINE four1

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,8 @@
#ifndef TWO_PUNCTURES_H #ifndef TWO_PUNCTURES_H
#define TWO_PUNCTURES_H #define TWO_PUNCTURES_H
#include <omp.h>
#define StencilSize 19 #define StencilSize 19
#define N_PlaneRelax 1 #define N_PlaneRelax 1
#define NRELAX 200 #define NRELAX 200
@@ -32,7 +33,7 @@ private:
int npoints_A, npoints_B, npoints_phi; int npoints_A, npoints_B, npoints_phi;
double target_M_plus, target_M_minus; double target_M_plus, target_M_minus;
double admMass; double admMass;
double adm_tol; double adm_tol;
@@ -42,32 +43,17 @@ private:
int ntotal; int ntotal;
// Pre-allocated workspace buffers for hot-path allocation elimination // ===== Precomputed spectral derivative matrices =====
// LineRelax_be workspace (sized for n2) double *D1_A, *D2_A;
double *ws_diag_be, *ws_e_be, *ws_f_be, *ws_b_be, *ws_x_be; double *D1_B, *D2_B;
// LineRelax_al workspace (sized for n1) double *DF1_phi, *DF2_phi;
double *ws_diag_al, *ws_e_al, *ws_f_al, *ws_b_al, *ws_x_al;
// ThomasAlgorithm workspace (sized for max(n1,n2)) // ===== Pre-allocated workspace for LineRelax (per-thread) =====
double *ws_thomas_y; int max_threads;
// JFD_times_dv workspace (sized for nvar) double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be;
double *ws_jfd_values; double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be;
derivs ws_jfd_dU, ws_jfd_U; double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al;
// chebft_Zeros workspace (sized for max(n1,n2,n3)+1) double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al;
double *ws_cheb_c;
// fourft workspace (sized for max(n1,n2,n3)/2+1 each)
double *ws_four_a, *ws_four_b;
// Derivatives_AB3 workspace
double *ws_deriv_p, *ws_deriv_dp, *ws_deriv_d2p;
double *ws_deriv_q, *ws_deriv_dq;
double *ws_deriv_r, *ws_deriv_dr;
int *ws_deriv_indx;
// F_of_v workspace
double *ws_fov_sources;
double *ws_fov_values;
derivs ws_fov_U;
// J_times_dv workspace
double *ws_jtdv_values;
derivs ws_jtdv_dU, ws_jtdv_U;
struct parameters struct parameters
{ {
@@ -85,6 +71,28 @@ public:
int Newtonmaxit); int Newtonmaxit);
~TwoPunctures(); ~TwoPunctures();
// 02/07: New/modified methods
void allocate_workspace();
void free_workspace();
void precompute_derivative_matrices();
void build_cheb_deriv_matrices(int n, double *D1, double *D2);
void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
double *l, double *u_ws, double *d, double *y);
void LineRelax_be_omp(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD, int tid);
void LineRelax_al_omp(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD, int tid);
void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD);
void Solve(); void Solve();
void set_initial_guess(derivs v); void set_initial_guess(derivs v);
int index(int i, int j, int k, int l, int a, int b, int c, int d); int index(int i, int j, int k, int l, int a, int b, int c, int d);
@@ -143,23 +151,11 @@ public:
double BY_KKofxyz(double x, double y, double z); double BY_KKofxyz(double x, double y, double z);
void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix); void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u); void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD);
void LineRelax_be(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD);
void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2, void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
int n3, derivs dv, derivs u, double *values); int n3, derivs dv, derivs u, double *values);
void LinEquations(double A, double B, double X, double R, void LinEquations(double A, double B, double X, double R,
double x, double r, double phi, double x, double r, double phi,
double y, double z, derivs dU, derivs U, double *values); double y, double z, derivs dU, derivs U, double *values);
void LineRelax_al(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD);
void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q); void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
void Save(char *fname); void Save(char *fname);
// provided by Vasileios Paschalidis (vpaschal@illinois.edu) // provided by Vasileios Paschalidis (vpaschal@illinois.edu)
@@ -168,4 +164,4 @@ public:
void SpecCoef(parameters par, int ivar, double *v, double *cf); void SpecCoef(parameters par, int ivar, double *v, double *cf);
}; };
#endif /* TWO_PUNCTURES_H */ #endif /* TWO_PUNCTURES_H */

View File

@@ -106,8 +106,7 @@
call getpbh(BHN,Porg,Mass) call getpbh(BHN,Porg,Mass)
#endif #endif
!!! sanity check (disabled in production builds for performance) !!! sanity check
#ifdef DEBUG
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) & dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) & +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
+sum(Gamx)+sum(Gamy)+sum(Gamz) & +sum(Gamx)+sum(Gamy)+sum(Gamz) &
@@ -137,7 +136,6 @@
gont = 1 gont = 1
return return
endif endif
#endif
PI = dacos(-ONE) PI = dacos(-ONE)
@@ -168,8 +166,6 @@
call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev) call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
!$OMP PARALLEL
!$OMP WORKSHARE
gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + & gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + &
TWO *( gxx * betaxx + gxy * betayx + gxz * betazx) TWO *( gxx * betaxx + gxy * betayx + gxz * betazx)
@@ -188,7 +184,7 @@
gxy * betaxz + gyy * betayz + & gxy * betaxz + gyy * betayz + &
gxz * betaxy + gzz * betazy & gxz * betaxy + gzz * betazy &
- gyz * betaxx - gyz * betaxx
gxz_rhs = - TWO * alpn1 * Axz + F1o3 * gxz * div_beta + & gxz_rhs = - TWO * alpn1 * Axz + F1o3 * gxz * div_beta + &
gxx * betaxz + gxy * betayz + & gxx * betaxz + gxy * betayz + &
gyz * betayx + gzz * betazx & gyz * betayx + gzz * betazx &
@@ -203,8 +199,6 @@
gupyy = ( gxx * gzz - gxz * gxz ) / gupzz gupyy = ( gxx * gzz - gxz * gxz ) / gupzz
gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz
gupzz = ( gxx * gyy - gxy * gxy ) / gupzz gupzz = ( gxx * gyy - gxy * gxy ) / gupzz
!$OMP END WORKSHARE
!$OMP END PARALLEL
if(co == 0)then if(co == 0)then
! Gam^i_Res = Gam^i + gup^ij_,j ! Gam^i_Res = Gam^i + gup^ij_,j
@@ -238,8 +232,6 @@
endif endif
! second kind of connection ! second kind of connection
!$OMP PARALLEL
!$OMP WORKSHARE
Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz )) Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz ))
Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz )) Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz ))
Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz )) Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz ))
@@ -288,8 +280,6 @@
(gupxy * gupyz + gupyy * gupxz)* Axy + & (gupxy * gupyz + gupyy * gupxz)* Axy + &
(gupxy * gupzz + gupyz * gupxz)* Axz + & (gupxy * gupzz + gupyz * gupxz)* Axz + &
(gupyy * gupzz + gupyz * gupyz)* Ayz (gupyy * gupzz + gupyz * gupyz)* Ayz
!$OMP END WORKSHARE
!$OMP END PARALLEL
! Right hand side for Gam^i without shift terms... ! Right hand side for Gam^i without shift terms...
call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
@@ -344,8 +334,6 @@
call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev) call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev) call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
!$OMP PARALLEL
!$OMP WORKSHARE
Gamx_rhs = Gamx_rhs + F2o3 * Gamxa * div_beta - & Gamx_rhs = Gamx_rhs + F2o3 * Gamxa * div_beta - &
Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz + & Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz + &
F1o3 * (gupxx * fxx + gupxy * fxy + gupxz * fxz ) + & F1o3 * (gupxx * fxx + gupxy * fxy + gupxz * fxz ) + &
@@ -385,8 +373,6 @@
gyyz = gxz * Gamxyy + gyz * Gamyyy + gzz * Gamzyy gyyz = gxz * Gamxyy + gyz * Gamyyy + gzz * Gamzyy
gyzz = gxz * Gamxyz + gyz * Gamyyz + gzz * Gamzyz gyzz = gxz * Gamxyz + gyz * Gamyyz + gzz * Gamzyz
gzzz = gxz * Gamxzz + gyz * Gamyzz + gzz * Gamzzz gzzz = gxz * Gamxzz + gyz * Gamyzz + gzz * Gamzzz
!$OMP END WORKSHARE
!$OMP END PARALLEL
!compute Ricci tensor for tilted metric !compute Ricci tensor for tilted metric
call fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,symmetry,Lev) call fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,symmetry,Lev)
@@ -413,8 +399,6 @@
Ryz = gupxx * fxx + gupyy * fyy + gupzz * fzz + & Ryz = gupxx * fxx + gupyy * fyy + gupzz * fzz + &
( gupxy * fxy + gupxz * fxz + gupyz * fyz ) * TWO ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) * TWO
!$OMP PARALLEL
!$OMP WORKSHARE
Rxx = - HALF * Rxx + & Rxx = - HALF * Rxx + &
gxx * Gamxx+ gxy * Gamyx + gxz * Gamzx + & gxx * Gamxx+ gxy * Gamyx + gxz * Gamzx + &
Gamxa * gxxx + Gamya * gxyx + Gamza * gxzx + & Gamxa * gxxx + Gamya * gxyx + Gamza * gxzx + &
@@ -615,13 +599,9 @@
Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz + & Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz + &
Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy + & Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy + &
Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz ) Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz )
!$OMP END WORKSHARE
!$OMP END PARALLEL
!covariant second derivative of chi respect to tilted metric !covariant second derivative of chi respect to tilted metric
call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
!$OMP PARALLEL
!$OMP WORKSHARE
fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz
fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz
fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz
@@ -644,15 +624,11 @@
Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO
Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO
Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO
!$OMP END WORKSHARE
!$OMP END PARALLEL
! covariant second derivatives of the lapse respect to physical metric ! covariant second derivatives of the lapse respect to physical metric
call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, & call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
SYM,SYM,SYM,symmetry,Lev) SYM,SYM,SYM,symmetry,Lev)
!$OMP PARALLEL
!$OMP WORKSHARE
gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1 gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1 gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1 gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
@@ -813,8 +789,6 @@
!!!! gauge variable part !!!! gauge variable part
Lap_rhs = -TWO*alpn1*trK Lap_rhs = -TWO*alpn1*trK
!$OMP END WORKSHARE
!$OMP END PARALLEL
#if (GAUGE == 0) #if (GAUGE == 0)
betax_rhs = FF*dtSfx betax_rhs = FF*dtSfx
betay_rhs = FF*dtSfy betay_rhs = FF*dtSfy

View File

@@ -997,11 +997,11 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
#if 0
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
! x direction #if 0
! x direction
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
@@ -1018,7 +1018,7 @@
! set imax and imin 0 ! set imax and imin 0
endif endif
! y direction ! y direction
if(j+2 <= jmax .and. j-2 >= jmin)then if(j+2 <= jmax .and. j-2 >= jmin)then
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k)) fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
@@ -1029,7 +1029,7 @@
! set jmax and jmin 0 ! set jmax and jmin 0
endif endif
! z direction ! z direction
if(k+2 <= kmax .and. k-2 >= kmin)then if(k+2 <= kmax .and. k-2 >= kmin)then
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2)) fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
@@ -1040,13 +1040,9 @@
! set kmax and kmin 0 ! set kmax and kmin 0
endif endif
enddo
enddo
enddo
#elif 0 #elif 0
do k=1,ex(3)-1 ! x direction
do j=1,ex(2)-1 if(i+2 <= imax .and. i-2 >= imin)then
do i=1,ex(1)-1
! !
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
! fx(i) = --------------------------------------------- ! fx(i) = ---------------------------------------------
@@ -1083,32 +1079,8 @@
! set kmax and kmin 0 ! set kmax and kmin 0
endif endif
enddo
enddo
enddo
#else #else
! for bam comparison — split into branch-free interior + serial boundary ! for bam comparison
! Interior: all stencil points guaranteed in-bounds, no branches needed
!$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
do k=max(3,1),min(ex(3)-1,kmax-2)
do j=max(3,1),min(ex(2)-1,jmax-2)
!DIR$ IVDEP
do i=max(3,1),min(ex(1)-1,imax-2)
fx(i,j,k)=d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
enddo
enddo
enddo
!$OMP END PARALLEL DO
! Boundary shell: original branching logic for points near edges
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
if(i >= 3 .and. i <= imax-2 .and. &
j >= 3 .and. j <= jmax-2 .and. &
k >= 3 .and. k <= kmax-2) cycle
if(i+2 <= imax .and. i-2 >= imin .and. & if(i+2 <= imax .and. i-2 >= imin .and. &
j+2 <= jmax .and. j-2 >= jmin .and. & j+2 <= jmax .and. j-2 >= jmin .and. &
k+2 <= kmax .and. k-2 >= kmin) then k+2 <= kmax .and. k-2 >= kmin) then
@@ -1122,10 +1094,10 @@
fy(i,j,k)=d2dy*(-fh(i,j-1,k)+fh(i,j+1,k)) fy(i,j,k)=d2dy*(-fh(i,j-1,k)+fh(i,j+1,k))
fz(i,j,k)=d2dz*(-fh(i,j,k-1)+fh(i,j,k+1)) fz(i,j,k)=d2dz*(-fh(i,j,k-1)+fh(i,j,k+1))
endif endif
enddo
enddo
enddo
#endif #endif
enddo
enddo
enddo
return return
@@ -1429,10 +1401,10 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
#if 0
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
#if 0
!~~~~~~ fxx !~~~~~~ fxx
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !
@@ -1509,48 +1481,9 @@
- (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2))) - (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
elseif(j+1 <= jmax .and. j-1 >= jmin .and. k+1 <= kmax .and. k-1 >= kmin)then elseif(j+1 <= jmax .and. j-1 >= jmin .and. k+1 <= kmax .and. k-1 >= kmin)then
fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1)) fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1))
endif endif
enddo
enddo
enddo
#else #else
! for bam comparison — split into branch-free interior + serial boundary ! for bam comparison
! Interior: all stencil points guaranteed in-bounds, no branches needed
!$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
do k=max(3,1),min(ex(3)-1,kmax-2)
do j=max(3,1),min(ex(2)-1,jmax-2)
!DIR$ IVDEP
do i=max(3,1),min(ex(1)-1,imax-2)
fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
-fh(i+2,j,k)+F16*fh(i+1,j,k) )
fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
-fh(i,j+2,k)+F16*fh(i,j+1,k) )
fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
-fh(i,j,k+2)+F16*fh(i,j,k+1) )
fxy(i,j,k) = Fdxdy*( (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k)) &
-F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k)) &
+F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k)) &
- (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
fxz(i,j,k) = Fdxdz*( (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2)) &
-F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1)) &
+F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1)) &
- (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
fyz(i,j,k) = Fdydz*( (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2)) &
-F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1)) &
+F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1)) &
- (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
enddo
enddo
enddo
!$OMP END PARALLEL DO
! Boundary shell: original branching logic for points near edges
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
if(i >= 3 .and. i <= imax-2 .and. &
j >= 3 .and. j <= jmax-2 .and. &
k >= 3 .and. k <= kmax-2) cycle
if(i+2 <= imax .and. i-2 >= imin .and. & if(i+2 <= imax .and. i-2 >= imin .and. &
j+2 <= jmax .and. j-2 >= jmin .and. & j+2 <= jmax .and. j-2 >= jmin .and. &
k+2 <= kmax .and. k-2 >= kmin) then k+2 <= kmax .and. k-2 >= kmin) then
@@ -1585,10 +1518,10 @@
fxz(i,j,k) = Sdxdz*(fh(i-1,j,k-1)-fh(i+1,j,k-1)-fh(i-1,j,k+1)+fh(i+1,j,k+1)) fxz(i,j,k) = Sdxdz*(fh(i-1,j,k-1)-fh(i+1,j,k-1)-fh(i-1,j,k+1)+fh(i+1,j,k+1))
fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1)) fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1))
endif endif
enddo
enddo
enddo
#endif #endif
enddo
enddo
enddo
return return

View File

@@ -17,62 +17,62 @@
real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Axx,Axy,Axz real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Axx,Axy,Axz
real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz
!~~~~~~~> Local variable: !~~~~~~~> Local variable:
integer :: i,j,k integer :: i,j,k
real*8 :: lgxx,lgyy,lgzz,ldetg real*8 :: lgxx,lgyy,lgzz,ldetg
real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
real*8 :: ltrA,lscale real*8 :: ltrA,lscale
real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
!~~~~~~> !~~~~~~>
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
lgxx = dxx(i,j,k) + ONE lgxx = dxx(i,j,k) + ONE
lgyy = dyy(i,j,k) + ONE lgyy = dyy(i,j,k) + ONE
lgzz = dzz(i,j,k) + ONE lgzz = dzz(i,j,k) + ONE
ldetg = lgxx * lgyy * lgzz & ldetg = lgxx * lgyy * lgzz &
+ gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) & + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) &
+ gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) & + gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
- gxz(i,j,k) * lgyy * gxz(i,j,k) & - gxz(i,j,k) * lgyy * gxz(i,j,k) &
- gxy(i,j,k) * gxy(i,j,k) * lgzz & - gxy(i,j,k) * gxy(i,j,k) * lgzz &
- lgxx * gyz(i,j,k) * gyz(i,j,k) - lgxx * gyz(i,j,k) * gyz(i,j,k)
lgupxx = ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg lgupxx = ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg
lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg
lgupxz = ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg lgupxz = ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg
lgupyy = ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg lgupyy = ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg
lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg
lgupzz = ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg lgupzz = ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg
ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) & ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+ lgupzz * Azz(i,j,k) & + lgupzz * Azz(i,j,k) &
+ TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) & + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
+ lgupyz * Ayz(i,j,k)) + lgupyz * Ayz(i,j,k))
Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA
Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA
Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA
Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
lscale = ONE / ( ldetg ** F1o3 ) lscale = ONE / ( ldetg ** F1o3 )
dxx(i,j,k) = lgxx * lscale - ONE dxx(i,j,k) = lgxx * lscale - ONE
gxy(i,j,k) = gxy(i,j,k) * lscale gxy(i,j,k) = gxy(i,j,k) * lscale
gxz(i,j,k) = gxz(i,j,k) * lscale gxz(i,j,k) = gxz(i,j,k) * lscale
dyy(i,j,k) = lgyy * lscale - ONE dyy(i,j,k) = lgyy * lscale - ONE
gyz(i,j,k) = gyz(i,j,k) * lscale gyz(i,j,k) = gyz(i,j,k) * lscale
dzz(i,j,k) = lgzz * lscale - ONE dzz(i,j,k) = lgzz * lscale - ONE
enddo enddo
enddo enddo
enddo enddo
return return
@@ -93,72 +93,72 @@
real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Axx,Axy,Axz real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Axx,Axy,Axz
real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz
!~~~~~~~> Local variable: !~~~~~~~> Local variable:
integer :: i,j,k integer :: i,j,k
real*8 :: lgxx,lgyy,lgzz,lscale real*8 :: lgxx,lgyy,lgzz,lscale
real*8 :: lgxy,lgxz,lgyz real*8 :: lgxy,lgxz,lgyz
real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
real*8 :: ltrA real*8 :: ltrA
real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
!~~~~~~> !~~~~~~>
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
! for g: normalize determinant first ! for g: normalize determinant first
lgxx = dxx(i,j,k) + ONE lgxx = dxx(i,j,k) + ONE
lgyy = dyy(i,j,k) + ONE lgyy = dyy(i,j,k) + ONE
lgzz = dzz(i,j,k) + ONE lgzz = dzz(i,j,k) + ONE
lgxy = gxy(i,j,k) lgxy = gxy(i,j,k)
lgxz = gxz(i,j,k) lgxz = gxz(i,j,k)
lgyz = gyz(i,j,k) lgyz = gyz(i,j,k)
lscale = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz & lscale = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz &
+ lgxz * lgxy * lgyz - lgxz * lgyy * lgxz & + lgxz * lgxy * lgyz - lgxz * lgyy * lgxz &
- lgxy * lgxy * lgzz - lgxx * lgyz * lgyz - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz
lscale = ONE / ( lscale ** F1o3 ) lscale = ONE / ( lscale ** F1o3 )
lgxx = lgxx * lscale lgxx = lgxx * lscale
lgxy = lgxy * lscale lgxy = lgxy * lscale
lgxz = lgxz * lscale lgxz = lgxz * lscale
lgyy = lgyy * lscale lgyy = lgyy * lscale
lgyz = lgyz * lscale lgyz = lgyz * lscale
lgzz = lgzz * lscale lgzz = lgzz * lscale
dxx(i,j,k) = lgxx - ONE dxx(i,j,k) = lgxx - ONE
gxy(i,j,k) = lgxy gxy(i,j,k) = lgxy
gxz(i,j,k) = lgxz gxz(i,j,k) = lgxz
dyy(i,j,k) = lgyy - ONE dyy(i,j,k) = lgyy - ONE
gyz(i,j,k) = lgyz gyz(i,j,k) = lgyz
dzz(i,j,k) = lgzz - ONE dzz(i,j,k) = lgzz - ONE
! for A: trace-free using normalized metric (det=1, no division needed) ! for A: trace-free using normalized metric (det=1, no division needed)
lgupxx = ( lgyy * lgzz - lgyz * lgyz ) lgupxx = ( lgyy * lgzz - lgyz * lgyz )
lgupxy = - ( lgxy * lgzz - lgyz * lgxz ) lgupxy = - ( lgxy * lgzz - lgyz * lgxz )
lgupxz = ( lgxy * lgyz - lgyy * lgxz ) lgupxz = ( lgxy * lgyz - lgyy * lgxz )
lgupyy = ( lgxx * lgzz - lgxz * lgxz ) lgupyy = ( lgxx * lgzz - lgxz * lgxz )
lgupyz = - ( lgxx * lgyz - lgxy * lgxz ) lgupyz = - ( lgxx * lgyz - lgxy * lgxz )
lgupzz = ( lgxx * lgyy - lgxy * lgxy ) lgupzz = ( lgxx * lgyy - lgxy * lgxy )
ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) & ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+ lgupzz * Azz(i,j,k) & + lgupzz * Azz(i,j,k) &
+ TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) & + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
+ lgupyz * Ayz(i,j,k)) + lgupyz * Ayz(i,j,k))
Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
enddo enddo
enddo enddo
enddo enddo
return return

View File

@@ -324,7 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
enddo enddo
@@ -349,7 +349,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1) funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1)
@@ -377,7 +377,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1) funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1)
@@ -881,27 +881,22 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
real*8, dimension(-ord+1:extc(1),-ord+1:extc(2),-ord+1:extc(3)),intent(out):: funcc real*8, dimension(-ord+1:extc(1),-ord+1:extc(2),-ord+1:extc(3)),intent(out):: funcc
real*8, dimension(1:3), intent(in) :: SoA real*8, dimension(1:3), intent(in) :: SoA
integer::i,j,k integer::i
!$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k) !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do k=1,extc(3) funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do j=1,extc(2) !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=1,extc(1) do i=0,ord-1
funcc(i,j,k) = func(i,j,k) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
enddo enddo
enddo !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
enddo do i=0,ord-1
!$OMP END PARALLEL DO funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
enddo
do i=0,ord-1 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) do i=0,ord-1
enddo funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
do i=0,ord-1 enddo
funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
enddo
do i=0,ord-1
funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
enddo
end subroutine symmetry_bd end subroutine symmetry_bd
@@ -917,7 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-i,1:extc(2),1:extc(3))*SoA(1) funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-i,1:extc(2),1:extc(3))*SoA(1)
@@ -945,7 +940,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-i,1:extc(2),1:extc(3))*SoA(1) funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-i,1:extc(2),1:extc(3))*SoA(1)
@@ -1116,162 +1111,355 @@ end subroutine d2dump
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
! common code for cell and vertex ! common code for cell and vertex
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
! Lagrangian polynomial interpolation ! Lagrangian polynomial interpolation
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
#ifndef POLINT6_USE_BARYCENTRIC
subroutine polint(xa, ya, x, y, dy, ordn) #define POLINT6_USE_BARYCENTRIC 1
implicit none #endif
integer, intent(in) :: ordn !DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
real*8, dimension(ordn), intent(in) :: xa, ya subroutine polint6_neville(xa, ya, x, y, dy)
real*8, intent(in) :: x implicit none
real*8, intent(out) :: y, dy
real*8, dimension(6), intent(in) :: xa, ya
integer :: i, m, ns, n_m real*8, intent(in) :: x
real*8, dimension(ordn) :: c, d, ho real*8, intent(out) :: y, dy
real*8 :: dif, dift, hp, h, den_val
integer :: i, m, ns, n_m
c = ya real*8, dimension(6) :: c, d, ho
d = ya real*8 :: dif, dift, hp, h, den_val
ho = xa - x
c = ya
ns = 1 d = ya
dif = abs(x - xa(1)) ho = xa - x
do i = 2, ordn ns = 1
dift = abs(x - xa(i)) dif = abs(x - xa(1))
if (dift < dif) then
ns = i do i = 2, 6
dif = dift dift = abs(x - xa(i))
end if if (dift < dif) then
end do ns = i
dif = dift
y = ya(ns) end if
ns = ns - 1 end do
do m = 1, ordn - 1 y = ya(ns)
n_m = ordn - m ns = ns - 1
do i = 1, n_m
hp = ho(i) do m = 1, 5
h = ho(i+m) n_m = 6 - m
den_val = hp - h do i = 1, n_m
hp = ho(i)
if (den_val == 0.0d0) then h = ho(i+m)
write(*,*) 'failure in polint for point',x den_val = hp - h
write(*,*) 'with input points: ',xa
stop if (den_val == 0.0d0) then
end if write(*,*) 'failure in polint for point',x
write(*,*) 'with input points: ',xa
den_val = (c(i+1) - d(i)) / den_val stop
end if
d(i) = h * den_val
c(i) = hp * den_val den_val = (c(i+1) - d(i)) / den_val
end do
d(i) = h * den_val
if (2 * ns < n_m) then c(i) = hp * den_val
dy = c(ns + 1) end do
else
dy = d(ns) if (2 * ns < n_m) then
ns = ns - 1 dy = c(ns + 1)
end if else
y = y + dy dy = d(ns)
end do ns = ns - 1
end if
return y = y + dy
end subroutine polint end do
!------------------------------------------------------------------------------
! return
! interpolation in 2 dimensions, follow yx order end subroutine polint6_neville
!
!------------------------------------------------------------------------------ !DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric
subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn) subroutine polint6_barycentric(xa, ya, x, y, dy)
implicit none implicit none
integer,intent(in) :: ordn real*8, dimension(6), intent(in) :: xa, ya
real*8, dimension(1:ordn), intent(in) :: x1a,x2a real*8, intent(in) :: x
real*8, dimension(1:ordn,1:ordn), intent(in) :: ya real*8, intent(out) :: y, dy
real*8, intent(in) :: x1,x2
real*8, intent(out) :: y,dy integer :: i, j
logical :: is_uniform
#ifdef POLINT_LEGACY_ORDER real*8, dimension(6) :: lambda
integer :: i,m real*8 :: dx, den_i, term, num, den, step, tol
real*8, dimension(ordn) :: ymtmp real*8, parameter :: c_uniform(6) = (/ -1.d0, 5.d0, -10.d0, 10.d0, -5.d0, 1.d0 /)
real*8, dimension(ordn) :: yntmp
do i = 1, 6
m=size(x1a) if (x == xa(i)) then
do i=1,m y = ya(i)
yntmp=ya(i,:) dy = 0.d0
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) return
end do end if
call polint(x1a,ymtmp,x1,y,dy,ordn) end do
#else
integer :: j step = xa(2) - xa(1)
real*8, dimension(ordn) :: ymtmp is_uniform = (step /= 0.d0)
real*8 :: dy_temp if (is_uniform) then
tol = 64.d0 * epsilon(1.d0) * max(1.d0, abs(step))
do j=1,ordn do i = 3, 6
call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn) if (abs((xa(i) - xa(i-1)) - step) > tol) then
end do is_uniform = .false.
call polint(x2a, ymtmp, x2, y, dy, ordn) exit
#endif end if
end do
return end if
end subroutine polin2
!------------------------------------------------------------------------------ if (is_uniform) then
! num = 0.d0
! interpolation in 3 dimensions, follow zyx order den = 0.d0
! do i = 1, 6
!------------------------------------------------------------------------------ term = c_uniform(i) / (x - xa(i))
subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn) num = num + term * ya(i)
implicit none den = den + term
end do
integer,intent(in) :: ordn y = num / den
real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a dy = 0.d0
real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya return
real*8, intent(in) :: x1,x2,x3 end if
real*8, intent(out) :: y,dy
do i = 1, 6
#ifdef POLINT_LEGACY_ORDER den_i = 1.d0
integer :: i,j,m,n do j = 1, 6
real*8, dimension(ordn,ordn) :: yatmp if (j /= i) then
real*8, dimension(ordn) :: ymtmp dx = xa(i) - xa(j)
real*8, dimension(ordn) :: yntmp if (dx == 0.0d0) then
real*8, dimension(ordn) :: yqtmp write(*,*) 'failure in polint for point',x
write(*,*) 'with input points: ',xa
m=size(x1a) stop
n=size(x2a) end if
do i=1,m den_i = den_i * dx
do j=1,n end if
yqtmp=ya(i,j,:) end do
call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn) lambda(i) = 1.d0 / den_i
end do end do
yntmp=yatmp(i,:)
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) num = 0.d0
end do den = 0.d0
call polint(x1a,ymtmp,x1,y,dy,ordn) do i = 1, 6
#else term = lambda(i) / (x - xa(i))
integer :: j, k num = num + term * ya(i)
real*8, dimension(ordn,ordn) :: yatmp den = den + term
real*8, dimension(ordn) :: ymtmp end do
real*8 :: dy_temp
y = num / den
do k=1,ordn dy = 0.d0
do j=1,ordn
call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn) return
end do end subroutine polint6_barycentric
end do
do k=1,ordn !DIR$ ATTRIBUTES FORCEINLINE :: polint
call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn) subroutine polint(xa, ya, x, y, dy, ordn)
end do implicit none
call polint(x3a, ymtmp, x3, y, dy, ordn)
#endif integer, intent(in) :: ordn
real*8, dimension(ordn), intent(in) :: xa, ya
return real*8, intent(in) :: x
end subroutine polin3 real*8, intent(out) :: y, dy
integer :: i, m, ns, n_m
real*8, dimension(ordn) :: c, d, ho
real*8 :: dif, dift, hp, h, den_val
if (ordn == 6) then
#if POLINT6_USE_BARYCENTRIC
call polint6_barycentric(xa, ya, x, y, dy)
#else
call polint6_neville(xa, ya, x, y, dy)
#endif
return
end if
c = ya
d = ya
ho = xa - x
ns = 1
dif = abs(x - xa(1))
do i = 2, ordn
dift = abs(x - xa(i))
if (dift < dif) then
ns = i
dif = dift
end if
end do
y = ya(ns)
ns = ns - 1
do m = 1, ordn - 1
n_m = ordn - m
do i = 1, n_m
hp = ho(i)
h = ho(i+m)
den_val = hp - h
if (den_val == 0.0d0) then
write(*,*) 'failure in polint for point',x
write(*,*) 'with input points: ',xa
stop
end if
den_val = (c(i+1) - d(i)) / den_val
d(i) = h * den_val
c(i) = hp * den_val
end do
if (2 * ns < n_m) then
dy = c(ns + 1)
else
dy = d(ns)
ns = ns - 1
end if
y = y + dy
end do
return
end subroutine polint
!------------------------------------------------------------------------------
! Compute Lagrange interpolation basis weights for one target point.
!------------------------------------------------------------------------------
!DIR$ ATTRIBUTES FORCEINLINE :: polint_lagrange_weights
subroutine polint_lagrange_weights(xa, x, w, ordn)
implicit none
integer, intent(in) :: ordn
real*8, dimension(1:ordn), intent(in) :: xa
real*8, intent(in) :: x
real*8, dimension(1:ordn), intent(out) :: w
integer :: i, j
real*8 :: num, den, dx
do i = 1, ordn
num = 1.d0
den = 1.d0
do j = 1, ordn
if (j /= i) then
dx = xa(i) - xa(j)
if (dx == 0.0d0) then
write(*,*) 'failure in polint for point',x
write(*,*) 'with input points: ',xa
stop
end if
num = num * (x - xa(j))
den = den * dx
end if
end do
w(i) = num / den
end do
return
end subroutine polint_lagrange_weights
!------------------------------------------------------------------------------
!
! interpolation in 2 dimensions, follow yx order
!
!------------------------------------------------------------------------------
subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
implicit none
integer,intent(in) :: ordn
real*8, dimension(1:ordn), intent(in) :: x1a,x2a
real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
real*8, intent(in) :: x1,x2
real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER
integer :: i,m
real*8, dimension(ordn) :: ymtmp
real*8, dimension(ordn) :: yntmp
m=size(x1a)
do i=1,m
yntmp=ya(i,:)
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
end do
call polint(x1a,ymtmp,x1,y,dy,ordn)
#else
integer :: j
real*8, dimension(ordn) :: ymtmp
real*8 :: dy_temp
do j=1,ordn
call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
end do
call polint(x2a, ymtmp, x2, y, dy, ordn)
#endif
return
end subroutine polin2
!------------------------------------------------------------------------------
!
! interpolation in 3 dimensions, follow zyx order
!
!------------------------------------------------------------------------------
subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
implicit none
integer,intent(in) :: ordn
real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
real*8, intent(in) :: x1,x2,x3
real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER
integer :: i,j,m,n
real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(ordn) :: ymtmp
real*8, dimension(ordn) :: yntmp
real*8, dimension(ordn) :: yqtmp
m=size(x1a)
n=size(x2a)
do i=1,m
do j=1,n
yqtmp=ya(i,j,:)
call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
end do
yntmp=yatmp(i,:)
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
end do
call polint(x1a,ymtmp,x1,y,dy,ordn)
#else
integer :: i, j, k
real*8, dimension(ordn) :: w1, w2
real*8, dimension(ordn) :: ymtmp
real*8 :: yx_sum, x_sum
call polint_lagrange_weights(x1a, x1, w1, ordn)
call polint_lagrange_weights(x2a, x2, w2, ordn)
do k = 1, ordn
yx_sum = 0.d0
do j = 1, ordn
x_sum = 0.d0
do i = 1, ordn
x_sum = x_sum + w1(i) * ya(i,j,k)
end do
yx_sum = yx_sum + w2(j) * x_sum
end do
ymtmp(k) = yx_sum
end do
call polint(x3a, ymtmp, x3, y, dy, ordn)
#endif
return
end subroutine polin3
!-------------------------------------------------------------------------------------- !--------------------------------------------------------------------------------------
! calculate L2norm ! calculate L2norm
subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,& subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
f,f_out,gw) f,f_out,gw)
@@ -1288,9 +1476,9 @@ end subroutine d2dump
real*8 :: dX, dY, dZ real*8 :: dX, dY, dZ
integer::imin,jmin,kmin integer::imin,jmin,kmin
integer::imax,jmax,kmax integer::imax,jmax,kmax
integer::i,j,k,n_elements integer::i,j,k,n_elements
real*8, dimension(:), allocatable :: f_flat real*8, dimension(:), allocatable :: f_flat
real*8, external :: DDOT real*8, external :: DDOT
dX = X(2) - X(1) dX = X(2) - X(1)
dY = Y(2) - Y(1) dY = Y(2) - Y(1)
@@ -1314,20 +1502,91 @@ if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1 if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1 if(dabs(Z(1)-zmin) < dZ) kmin = 1
! Optimized with oneMKL BLAS DDOT for dot product n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) allocate(f_flat(n_elements))
allocate(f_flat(n_elements)) f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements]) f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1) deallocate(f_flat)
deallocate(f_flat)
f_out = f_out*dX*dY*dZ f_out = f_out*dX*dY*dZ
return return
end subroutine l2normhelper end subroutine l2normhelper
!-------------------------------------------------------------------------------------- !--------------------------------------------------------------------------------------
! calculate L2norm especially for shell Blocks subroutine l2normhelper7(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
f1,f2,f3,f4,f5,f6,f7,f_out,gw)
implicit none
!~~~~~~> Input parameters:
integer,intent(in ):: ex(1:3)
real*8, intent(in ):: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3)),xmin,ymin,zmin,xmax,ymax,zmax
integer,intent(in)::gw
real*8, dimension(ex(1),ex(2),ex(3)),intent(in) :: f1,f2,f3,f4,f5,f6,f7
real*8, intent(out) :: f_out(7)
!~~~~~~> Other variables:
real*8 :: dX, dY, dZ
integer::imin,jmin,kmin
integer::imax,jmax,kmax
integer::i,j,k
real*8 :: s1,s2,s3,s4,s5,s6,s7
dX = X(2) - X(1)
dY = Y(2) - Y(1)
dZ = Z(2) - Z(1)
imin = gw+1
jmin = gw+1
kmin = gw+1
imax = ex(1) - gw
jmax = ex(2) - gw
kmax = ex(3) - gw
if(dabs(X(ex(1))-xmax) < dX) imax = ex(1)
if(dabs(Y(ex(2))-ymax) < dY) jmax = ex(2)
if(dabs(Z(ex(3))-zmax) < dZ) kmax = ex(3)
if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1
s1 = 0.d0
s2 = 0.d0
s3 = 0.d0
s4 = 0.d0
s5 = 0.d0
s6 = 0.d0
s7 = 0.d0
do k=kmin,kmax
do j=jmin,jmax
!DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7)
do i=imin,imax
s1 = s1 + f1(i,j,k)*f1(i,j,k)
s2 = s2 + f2(i,j,k)*f2(i,j,k)
s3 = s3 + f3(i,j,k)*f3(i,j,k)
s4 = s4 + f4(i,j,k)*f4(i,j,k)
s5 = s5 + f5(i,j,k)*f5(i,j,k)
s6 = s6 + f6(i,j,k)*f6(i,j,k)
s7 = s7 + f7(i,j,k)*f7(i,j,k)
enddo
enddo
enddo
f_out(1) = s1*dX*dY*dZ
f_out(2) = s2*dX*dY*dZ
f_out(3) = s3*dX*dY*dZ
f_out(4) = s4*dX*dY*dZ
f_out(5) = s5*dX*dY*dZ
f_out(6) = s6*dX*dY*dZ
f_out(7) = s7*dX*dY*dZ
return
end subroutine l2normhelper7
!--------------------------------------------------------------------------------------
! calculate L2norm especially for shell Blocks
subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,& subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
f,f_out,gw,ogw,Symmetry) f,f_out,gw,ogw,Symmetry)
@@ -1344,9 +1603,9 @@ f_out = f_out*dX*dY*dZ
real*8 :: dX, dY, dZ real*8 :: dX, dY, dZ
integer::imin,jmin,kmin integer::imin,jmin,kmin
integer::imax,jmax,kmax integer::imax,jmax,kmax
integer::i,j,k,n_elements integer::i,j,k,n_elements
real*8, dimension(:), allocatable :: f_flat real*8, dimension(:), allocatable :: f_flat
real*8, external :: DDOT real*8, external :: DDOT
real*8 :: PIo4 real*8 :: PIo4
@@ -1409,12 +1668,11 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1 if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif endif
! Optimized with oneMKL BLAS DDOT for dot product n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) allocate(f_flat(n_elements))
allocate(f_flat(n_elements)) f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements]) f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1) deallocate(f_flat)
deallocate(f_flat)
f_out = f_out*dX*dY*dZ f_out = f_out*dX*dY*dZ
@@ -1441,9 +1699,9 @@ f_out = f_out*dX*dY*dZ
real*8 :: dX, dY, dZ real*8 :: dX, dY, dZ
integer::imin,jmin,kmin integer::imin,jmin,kmin
integer::imax,jmax,kmax integer::imax,jmax,kmax
integer::i,j,k integer::i,j,k
real*8, dimension(:), allocatable :: f_flat real*8, dimension(:), allocatable :: f_flat
real*8, external :: DDOT real*8, external :: DDOT
real*8 :: PIo4 real*8 :: PIo4
@@ -1506,12 +1764,11 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1 if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif endif
! Optimized with oneMKL BLAS DDOT for dot product Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) allocate(f_flat(Nout))
allocate(f_flat(Nout)) f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout]) f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
f_out = DDOT(Nout, f_flat, 1, f_flat, 1) deallocate(f_flat)
deallocate(f_flat)
return return
@@ -1612,9 +1869,12 @@ deallocate(f_flat)
! ^ ! ^
! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3 ! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3
real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0 real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
integer :: i,j,k
fout = C1*f1+C2*f2+C3*f3
do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
end do
return return
@@ -1708,8 +1968,8 @@ deallocate(f_flat)
real*8, dimension(ORDN,ORDN,ORDN) :: ya real*8, dimension(ORDN,ORDN,ORDN) :: ya
real*8, dimension(ORDN,ORDN) :: tmp2 real*8, dimension(ORDN,ORDN) :: tmp2
real*8, dimension(ORDN) :: tmp1 real*8, dimension(ORDN) :: tmp1
real*8, dimension(3) :: SoAh real*8, dimension(3) :: SoAh
real*8, external :: DDOT real*8, external :: DDOT
! +1 because c++ gives 0 for first point ! +1 because c++ gives 0 for first point
cxB = inds+1 cxB = inds+1
@@ -1745,21 +2005,17 @@ deallocate(f_flat)
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3)) ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
endif endif
! Optimized with BLAS operations for better performance
! First dimension: z-direction weighted sum
tmp2=0 tmp2=0
do m=1,ORDN do m=1,ORDN
tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m) tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
enddo enddo
! Second dimension: y-direction weighted sum
tmp1=0 tmp1=0
do m=1,ORDN do m=1,ORDN
tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m) tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
enddo enddo
! Third dimension: x-direction weighted sum using BLAS DDOT f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
return return
@@ -1788,8 +2044,8 @@ deallocate(f_flat)
integer,dimension(2) :: cxB,cxT integer,dimension(2) :: cxB,cxT
real*8, dimension(ORDN,ORDN) :: ya real*8, dimension(ORDN,ORDN) :: ya
real*8, dimension(ORDN) :: tmp1 real*8, dimension(ORDN) :: tmp1
real*8, dimension(2) :: SoAh real*8, dimension(2) :: SoAh
real*8, external :: DDOT real*8, external :: DDOT
! +1 because c++ gives 0 for first point ! +1 because c++ gives 0 for first point
cxB = inds(1:2)+1 cxB = inds(1:2)+1
@@ -1819,14 +2075,12 @@ deallocate(f_flat)
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3)) ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
endif endif
! Optimized with BLAS operations
tmp1=0 tmp1=0
do m=1,ORDN do m=1,ORDN
tmp1 = tmp1 + coef(ORDN+m)*ya(:,m) tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
enddo enddo
! Use BLAS DDOT for final weighted sum f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
return return
@@ -1852,12 +2106,12 @@ deallocate(f_flat)
!~~~~~~> Other parameters: !~~~~~~> Other parameters:
real*8, dimension(-ORDN+1:ex(1)+ORDN,-ORDN+1:ex(2)+ORDN,ex(3)) :: fh real*8, dimension(-ORDN+1:ex(1)+ORDN,-ORDN+1:ex(2)+ORDN,ex(3)) :: fh
integer :: m integer :: m
integer :: cxB,cxT integer :: cxB,cxT
real*8, dimension(ORDN) :: ya real*8, dimension(ORDN) :: ya
real*8 :: SoAh real*8 :: SoAh
integer,dimension(3) :: inds integer,dimension(3) :: inds
real*8, external :: DDOT real*8, external :: DDOT
! +1 because c++ gives 0 for first point ! +1 because c++ gives 0 for first point
inds = indsi + 1 inds = indsi + 1
@@ -1918,8 +2172,7 @@ deallocate(f_flat)
write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
endif endif
! Optimized with BLAS DDOT for weighted sum f_int = DDOT(ORDN, coef, 1, ya, 1)
f_int = DDOT(ORDN, coef, 1, ya, 1)
return return
@@ -2151,38 +2404,32 @@ deallocate(f_flat)
end function fWigner_d_function end function fWigner_d_function
!---------------------------------- !----------------------------------
! Optimized factorial function using lookup table for small N
! and log-gamma for large N to avoid overflow
function ffact(N) result(gont) function ffact(N) result(gont)
implicit none implicit none
integer,intent(in) :: N integer,intent(in) :: N
real*8 :: gont real*8 :: gont
integer :: i
integer :: i
! Lookup table for factorials 0! to 20! (precomputed) real*8, parameter, dimension(0:20) :: fact_table = [ &
real*8, parameter, dimension(0:20) :: fact_table = [ & 1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, & 362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, & 87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
87178291200.d0, 1307674368000.d0, 20922789888000.d0, & 355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, & 2432902008176640000.d0 ]
2432902008176640000.d0 ]
! sanity check ! sanity check
if(N < 0)then if(N < 0)then
write(*,*) "ffact: error input for factorial" write(*,*) "ffact: error input for factorial"
gont = 1.d0 gont = 1.d0
return return
endif endif
! Use lookup table for small N (fast path) if(N <= 20)then
if(N <= 20)then gont = fact_table(N)
gont = fact_table(N) else
else gont = exp(log_gamma(dble(N+1)))
! Use log-gamma function for large N: N! = exp(log_gamma(N+1)) endif
! This avoids overflow and is computed efficiently
gont = exp(log_gamma(dble(N+1)))
endif
return return

View File

@@ -12,9 +12,10 @@
#define f_global_interpind global_interpind #define f_global_interpind global_interpind
#define f_global_interpind2d global_interpind2d #define f_global_interpind2d global_interpind2d
#define f_global_interpind1d global_interpind1d #define f_global_interpind1d global_interpind1d
#define f_l2normhelper l2normhelper #define f_l2normhelper l2normhelper
#define f_l2normhelper_sh l2normhelper_sh #define f_l2normhelper7 l2normhelper7
#define f_l2normhelper_sh_rms l2normhelper_sh_rms #define f_l2normhelper_sh l2normhelper_sh
#define f_l2normhelper_sh_rms l2normhelper_sh_rms
#define f_average average #define f_average average
#define f_average3 average3 #define f_average3 average3
#define f_average2 average2 #define f_average2 average2
@@ -41,9 +42,10 @@
#define f_global_interpind GLOBAL_INTERPIND #define f_global_interpind GLOBAL_INTERPIND
#define f_global_interpind2d GLOBAL_INTERPIND2D #define f_global_interpind2d GLOBAL_INTERPIND2D
#define f_global_interpind1d GLOBAL_INTERPIND1D #define f_global_interpind1d GLOBAL_INTERPIND1D
#define f_l2normhelper L2NORMHELPER #define f_l2normhelper L2NORMHELPER
#define f_l2normhelper_sh L2NORMHELPER_SH #define f_l2normhelper7 L2NORMHELPER7
#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS #define f_l2normhelper_sh L2NORMHELPER_SH
#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
#define f_average AVERAGE #define f_average AVERAGE
#define f_average3 AVERAGE3 #define f_average3 AVERAGE3
#define f_average2 AVERAGE2 #define f_average2 AVERAGE2
@@ -70,9 +72,10 @@
#define f_global_interpind global_interpind_ #define f_global_interpind global_interpind_
#define f_global_interpind2d global_interpind2d_ #define f_global_interpind2d global_interpind2d_
#define f_global_interpind1d global_interpind1d_ #define f_global_interpind1d global_interpind1d_
#define f_l2normhelper l2normhelper_ #define f_l2normhelper l2normhelper_
#define f_l2normhelper_sh l2normhelper_sh_ #define f_l2normhelper7 l2normhelper7_
#define f_l2normhelper_sh_rms l2normhelper_sh_rms_ #define f_l2normhelper_sh l2normhelper_sh_
#define f_l2normhelper_sh_rms l2normhelper_sh_rms_
#define f_average average_ #define f_average average_
#define f_average3 average3_ #define f_average3 average3_
#define f_average2 average2_ #define f_average2 average2_
@@ -156,21 +159,30 @@ extern "C"
int *, double *, int &, int &); int *, double *, int &, int &);
} }
extern "C" extern "C"
{ {
void f_l2normhelper(int *, double *, double *, double *, void f_l2normhelper(int *, double *, double *, double *,
double &, double &, double &, double &, double &, double &,
double &, double &, double &, double &, double &, double &,
double *, double &, int &); double *, double &, int &);
} }
extern "C" extern "C"
{ {
void f_l2normhelper_sh(int *, double *, double *, double *, void f_l2normhelper7(int *, double *, double *, double *,
double &, double &, double &, double &, double &, double &,
double &, double &, double &, double &, double &, double &,
double *, double &, int &, int &, int &); double *, double *, double *, double *,
} double *, double *, double *, double *, int &);
}
extern "C"
{
void f_l2normhelper_sh(int *, double *, double *, double *,
double &, double &, double &,
double &, double &, double &,
double *, double &, int &, int &, int &);
}
extern "C" extern "C"
{ {

View File

@@ -16,66 +16,115 @@ using namespace std;
#include <string.h> #include <string.h>
#include <math.h> #include <math.h>
#endif #endif
/* Linear equation solution by Gauss-Jordan elimination.
// Intel oneMKL LAPACK interface
#include <mkl_lapacke.h>
/* Linear equation solution using Intel oneMKL LAPACK.
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
containing the right-hand side vectors. On output a is containing the right-hand side vectors. On output a is
replaced by its matrix inverse, and b is replaced by the replaced by its matrix inverse, and b is replaced by the
corresponding set of solution vectors. corresponding set of solution vectors */
Mathematical equivalence:
Solves: A * x = b => x = A^(-1) * b
Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
within numerical precision. */
int gaussj(double *a, double *b, int n) int gaussj(double *a, double *b, int n)
{ {
// Allocate pivot array and workspace double swap;
lapack_int *ipiv = new lapack_int[n];
lapack_int info;
// Make a copy of matrix a for solving (dgesv modifies it to LU form) int *indxc, *indxr, *ipiv;
double *a_copy = new double[n * n]; indxc = new int[n];
for (int i = 0; i < n * n; i++) { indxr = new int[n];
a_copy[i] = a[i]; ipiv = new int[n];
int i, icol, irow, j, k, l, ll;
double big, dum, pivinv, temp;
for (j = 0; j < n; j++)
ipiv[j] = 0;
for (i = 0; i < n; i++)
{
big = 0.0;
for (j = 0; j < n; j++)
if (ipiv[j] != 1)
for (k = 0; k < n; k++)
{
if (ipiv[k] == 0)
{
if (fabs(a[j * n + k]) >= big)
{
big = fabs(a[j * n + k]);
irow = j;
icol = k;
}
}
else if (ipiv[k] > 1)
{
cout << "gaussj: Singular Matrix-1" << endl;
for (int ii = 0; ii < n; ii++)
{
for (int jj = 0; jj < n; jj++)
cout << a[ii * n + jj] << " ";
cout << endl;
}
return 1; // error return
}
}
ipiv[icol] = ipiv[icol] + 1;
if (irow != icol)
{
for (l = 0; l < n; l++)
{
swap = a[irow * n + l];
a[irow * n + l] = a[icol * n + l];
a[icol * n + l] = swap;
}
swap = b[irow];
b[irow] = b[icol];
b[icol] = swap;
}
indxr[i] = irow;
indxc[i] = icol;
if (a[icol * n + icol] == 0.0)
{
cout << "gaussj: Singular Matrix-2" << endl;
for (int ii = 0; ii < n; ii++)
{
for (int jj = 0; jj < n; jj++)
cout << a[ii * n + jj] << " ";
cout << endl;
}
return 1; // error return
}
pivinv = 1.0 / a[icol * n + icol];
a[icol * n + icol] = 1.0;
for (l = 0; l < n; l++)
a[icol * n + l] *= pivinv;
b[icol] *= pivinv;
for (ll = 0; ll < n; ll++)
if (ll != icol)
{
dum = a[ll * n + icol];
a[ll * n + icol] = 0.0;
for (l = 0; l < n; l++)
a[ll * n + l] -= a[icol * n + l] * dum;
b[ll] -= b[icol] * dum;
}
} }
// Step 1: Solve linear system A*x = b using LU decomposition for (l = n - 1; l >= 0; l--)
// LAPACKE_dgesv uses column-major by default, but we use row-major {
info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1); if (indxr[l] != indxc[l])
for (k = 0; k < n; k++)
if (info != 0) { {
cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl; swap = a[k * n + indxr[l]];
delete[] ipiv; a[k * n + indxr[l]] = a[k * n + indxc[l]];
delete[] a_copy; a[k * n + indxc[l]] = swap;
return 1; }
}
// Step 2: Compute matrix inverse A^(-1) using LU factorization
// First do LU factorization of original matrix a
info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
}
// Then compute inverse from LU factorization
info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
} }
delete[] indxc;
delete[] indxr;
delete[] ipiv; delete[] ipiv;
delete[] a_copy;
return 0; return 0;
} }

View File

@@ -512,10 +512,11 @@
IMPLICIT DOUBLE PRECISION (A-H,O-Z) IMPLICIT DOUBLE PRECISION (A-H,O-Z)
DIMENSION V(N),W(N) DIMENSION V(N),W(N)
! SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT. ! SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT.
! Optimized using Intel oneMKL BLAS ddot
! Mathematical equivalence: DGVV = sum_{i=1}^{N} V(i)*W(i)
DOUBLE PRECISION, EXTERNAL :: DDOT SUM = 0.0D0
DGVV = DDOT(N, V, 1, W, 1) DO 10 I = 1,N
SUM = SUM + V(I)*W(I)
10 CONTINUE
DGVV = SUM
RETURN RETURN
END END

View File

@@ -159,42 +159,36 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
call symmetry_bd(3,ex,f,fh,SoA) call symmetry_bd(3,ex,f,fh,SoA)
! Interior: all stencil points guaranteed in-bounds
!$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
do k=4,ex(3)-3
do j=4,ex(2)-3
!DIR$ IVDEP
do i=4,ex(1)-3
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )/dX + &
( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )/dY + &
( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )/dZ )
enddo
enddo
enddo
!$OMP END PARALLEL DO
! Boundary shell: original branching logic for points near edges
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
if(i >= 4 .and. i <= ex(1)-3 .and. &
j >= 4 .and. j <= ex(2)-3 .and. &
k >= 4 .and. k <= ex(3)-3) cycle
if(i-3 >= imin .and. i+3 <= imax .and. & if(i-3 >= imin .and. i+3 <= imax .and. &
j-3 >= jmin .and. j+3 <= jmax .and. & j-3 >= jmin .and. j+3 <= jmax .and. &
k-3 >= kmin .and. k+3 <= kmax) then k-3 >= kmin .and. k+3 <= kmax) then
#if 0
! x direction
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dX/cof * ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )
! y direction
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dY/cof * ( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )
! z direction
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dZ/cof * ( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )
#else
! calculation order if important ?
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( & f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - & (fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + & SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
@@ -210,7 +204,9 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + & SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - & FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )/dZ ) TWT* fh(i,j,k) )/dZ )
#endif
endif endif
enddo enddo
enddo enddo
enddo enddo

View File

@@ -231,13 +231,12 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
call symmetry_bd(3,ex,f,fh,SoA) call symmetry_bd(3,ex,f,fh,SoA)
! upper bound set ex-1 only for efficiency, ! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also ! the loop body will set ex 0 also
!$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
#if 0 #if 0
!! old code !! old code
! x direction ! x direction
if(Sfx(i,j,k) >= ZEO .and. i+3 <= imax .and. i-1 >= imin)then if(Sfx(i,j,k) >= ZEO .and. i+3 <= imax .and. i-1 >= imin)then
@@ -483,7 +482,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
enddo enddo
enddo enddo
enddo enddo
!$OMP END PARALLEL DO
return return

View File

@@ -2,7 +2,7 @@
#ifndef MICRODEF_H #ifndef MICRODEF_H
#define MICRODEF_H #define MICRODEF_H
#include "macrodef.fh" #include "macrodef.fh"
// application parameters // application parameters

View File

@@ -1,11 +1,25 @@
include makefile.inc include makefile.inc
.SUFFIXES: .o .f90 .C .for .cu ## polint(ordn=6) kernel selector:
## 1 (default): barycentric fast path
.f90.o: ## 0 : fallback to Neville path
$(f90) $(f90appflags) -c $< -o $@ POLINT6_USE_BARY ?= 1
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
ARCH_OPT = -march=x86-64-v4
CXXAPPFLAGS = -O3 $(ARCH_OPT) -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 $(ARCH_OPT) -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
TP_OPTFLAGS = -O3 $(ARCH_OPT) -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include
.SUFFIXES: .o .f90 .C .for .cu
.f90.o:
$(f90) $(f90appflags) -c $< -o $@
.C.o: .C.o:
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
@@ -13,8 +27,14 @@ include makefile.inc
.for.o: .for.o:
$(f77) -c $< -o $@ $(f77) -c $< -o $@
.cu.o: .cu.o:
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
TwoPunctures.o: TwoPunctures.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
# Input files # Input files
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
@@ -95,8 +115,8 @@ ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES) TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS) $(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean: clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f rm *.o ABE ABEGPU TwoPunctureABE make.log -f

View File

@@ -1,30 +1,32 @@
## GCC version (commented out) ## GCC version (commented out)
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ ## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ ## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran ## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Intel oneAPI version with oneMKL (Optimized for performance) ## Intel oneAPI version with oneMKL
filein = -I/usr/include/ -I${MKLROOT}/include filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Use sequential oneMKL to avoid introducing extra OpenMP behavior into ABE.
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lifcore -limf -lpthread -lm -ldl -qopenmp
## Optional Intel oneTBB allocator, kept aligned with main's build environment.
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
## Aggressive optimization flags:
## -O3: Maximum optimization
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
## -fp-model fast=2: Aggressive floating-point optimizations
## -fma: Enable fused multiply-add instructions
## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \
-align array64byte -fpp -I${MKLROOT}/include
f90 = ifx f90 = ifx
f77 = ifx f77 = ifx
CXX = icpx CXX = icpx
CC = icx CC = icx
CLINKER = mpiicpx CLINKER = mpiicpx
Cu = nvcc Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include

View File

@@ -11,17 +11,6 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import subprocess import subprocess
## CPU core binding configuration using taskset
## taskset ensures all child processes inherit the CPU affinity mask
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
NUMACTL_CPU_BIND = "taskset -c 0-111"
## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 104
################################################################## ##################################################################
@@ -37,11 +26,11 @@ def makefile_ABE():
print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " )
print( ) print( )
## Build command with CPU binding to nohz_full cores ## Build command
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE" makefile_command = "make -j96" + " ABE"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU" makefile_command = "make -j4" + " ABEGPU"
else: else:
print( " CPU/GPU numerical calculation setting is wrong " ) print( " CPU/GPU numerical calculation setting is wrong " )
print( ) print( )
@@ -78,8 +67,8 @@ def makefile_TwoPunctureABE():
print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " ) print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
print( ) print( )
## Build command with CPU binding to nohz_full cores ## Build command
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE" makefile_command = "make" + " TwoPunctureABE"
## Execute the command with subprocess.Popen and stream output ## Execute the command with subprocess.Popen and stream output
makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
@@ -116,10 +105,10 @@ def run_ABE():
## Define the command to run; cast other values to strings as needed ## Define the command to run; cast other values to strings as needed
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = "mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command = "mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output ## Execute the MPI command and stream output
@@ -158,7 +147,7 @@ def run_TwoPunctureABE():
print( ) print( )
## Define the command to run ## Define the command to run
TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE" TwoPuncture_command = "./TwoPunctureABE"
TwoPuncture_command_outfile = "TwoPunctureABE_out.log" TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
## Execute the command with subprocess.Popen and stream output ## Execute the command with subprocess.Popen and stream output

12
parallel_plot_helper.py Normal file
View File

@@ -0,0 +1,12 @@
import multiprocessing
def run_plot_task(task):
func, args = task
return func(*args)
def run_plot_tasks_parallel(plot_tasks):
ctx = multiprocessing.get_context('fork')
with ctx.Pool() as pool:
pool.map(run_plot_task, plot_tasks)

View File

@@ -8,11 +8,13 @@
## ##
################################################# #################################################
import numpy ## numpy for array operations import numpy ## numpy for array operations
import scipy ## scipy for interpolation and signal processing import scipy ## scipy for interpolation and signal processing
import math import math
import matplotlib.pyplot as plt ## matplotlib for plotting import matplotlib
import os ## os for system/file operations matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt ## matplotlib for plotting
import os ## os for system/file operations
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data

View File

@@ -6,17 +6,22 @@
## Author: Xiaoqu ## Author: Xiaoqu
## Dates: 2024/10/01 --- 2025/09/14 ## Dates: 2024/10/01 --- 2025/09/14
## ##
################################################# #################################################
import numpy ## Restrict OpenMP to one thread per process so that parallel
import scipy ## subprocess plotting does not multiply BLAS thread counts.
import matplotlib.pyplot as plt import os
from matplotlib.colors import LogNorm os.environ.setdefault("OMP_NUM_THREADS", "1")
from mpl_toolkits.mplot3d import Axes3D
## import torch import numpy
import AMSS_NCKU_Input as input_data import scipy
import matplotlib
import os matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from mpl_toolkits.mplot3d import Axes3D
## import torch
import AMSS_NCKU_Input as input_data
######################################################################################### #########################################################################################
@@ -92,9 +97,9 @@ def plot_binary_data( filename, binary_outdir, figure_outdir ):
#################################################################################### ####################################################################################
# Plot a single binary dataset (2D slices and 3D surface) # Plot a single binary dataset (2D slices and 3D surface)
def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ): def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
@@ -188,7 +193,15 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
plt.savefig( os.path.join(figure_surfaceplot_outdir, figure_title + " time = " + str(time) + " surface_plot.pdf") ) # save figure plt.savefig( os.path.join(figure_surfaceplot_outdir, figure_title + " time = " + str(time) + " surface_plot.pdf") ) # save figure
plt.close() plt.close()
return return
#################################################################################### ####################################################################################
## Allow standalone subprocess execution for parallel binary-data plotting.
if __name__ == '__main__':
import sys
if len(sys.argv) != 4:
print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
sys.exit(1)
plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3])

View File

@@ -6,15 +6,20 @@
## 2024/10/01 --- 2025/09/14 ## 2024/10/01 --- 2025/09/14
## ##
################################################# #################################################
import numpy ## numpy for array operations import numpy ## numpy for array operations
import matplotlib.pyplot as plt ## matplotlib for plotting import matplotlib
from mpl_toolkits.mplot3d import Axes3D ## needed for 3D plots matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import glob import matplotlib.pyplot as plt ## matplotlib for plotting
import os ## operating system utilities from mpl_toolkits.mplot3d import Axes3D ## needed for 3D plots
import glob
import plot_binary_data import os ## operating system utilities
import AMSS_NCKU_Input as input_data
import plot_binary_data
import AMSS_NCKU_Input as input_data
import subprocess
import sys
import multiprocessing
# plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots # plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots
@@ -50,13 +55,37 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
file_list.append(x) file_list.append(x)
print(x) print(x)
## Plot each file in the list ## Plot each file in parallel using subprocesses.
for filename in file_list: ## Each subprocess starts with BLAS thread limits in plot_binary_data.py.
print(filename) script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir) max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
print( ) running = []
print( " Binary Data Plot Has been Finished " ) failed = []
for filename in file_list:
print(filename)
proc = subprocess.Popen(
[sys.executable, script, filename, binary_outdir, figure_outdir],
)
running.append( (proc, filename) )
if len(running) >= max_workers:
p, fn = running.pop(0)
p.wait()
if p.returncode != 0:
failed.append(fn)
for p, fn in running:
p.wait()
if p.returncode != 0:
failed.append(fn)
if failed:
print( " WARNING: the following binary data plots failed:" )
for fn in failed:
print( " ", fn )
print( )
print( " Binary Data Plot Has been Finished " )
print( ) print( )
return return