Compare commits

..

1 Commits

Author SHA1 Message Date
CGH0S7
79af79d471 baseline updated 2026-02-05 19:53:55 +08:00
24 changed files with 695 additions and 2451 deletions

4
.gitignore vendored
View File

@@ -1,6 +1,2 @@
__pycache__ __pycache__
GW150914 GW150914
GW150914-origin
docs
*.tmp

View File

@@ -8,14 +8,6 @@
## ##
################################################################## ##################################################################
## Guard against re-execution by multiprocessing child processes.
## Without this, using 'spawn' or 'forkserver' context would cause every
## worker to re-run the entire script, spawning exponentially more
## workers (fork bomb).
if __name__ != '__main__':
import sys as _sys
_sys.exit(0)
################################################################## ##################################################################
@@ -432,31 +424,26 @@ print(
import plot_xiaoqu import plot_xiaoqu
import plot_GW_strain_amplitude_xiaoqu import plot_GW_strain_amplitude_xiaoqu
from parallel_plot_helper import run_plot_tasks_parallel
plot_tasks = []
## Plot black hole trajectory ## Plot black hole trajectory
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot, (binary_results_directory, figure_directory) ) ) plot_xiaoqu.generate_puncture_orbit_plot( binary_results_directory, figure_directory )
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) ) plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
## Plot black hole separation vs. time ## Plot black hole separation vs. time
plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) ) plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
## Plot gravitational waveforms (psi4 and strain amplitude) ## Plot gravitational waveforms (psi4 and strain amplitude)
for i in range(input_data.Detector_Number): for i in range(input_data.Detector_Number):
plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) ) plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) ) plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
## Plot ADM mass evolution ## Plot ADM mass evolution
for i in range(input_data.Detector_Number): for i in range(input_data.Detector_Number):
plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) ) plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
## Plot Hamiltonian constraint violation over time ## Plot Hamiltonian constraint violation over time
for i in range(input_data.grid_level): for i in range(input_data.grid_level):
plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) ) plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
run_plot_tasks_parallel(plot_tasks)
## Plot stored binary data ## Plot stored binary data
plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory ) plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )

View File

@@ -1,279 +0,0 @@
#!/usr/bin/env python3
"""
AMSS-NCKU GW150914 Simulation Regression Test Script
Verification Requirements:
1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
2. ADM constraint violation < 2 (Grid Level 0)
RMS Calculation Method:
- Computes trajectory deviation on the XY plane independently for BH1 and BH2
- For each black hole: RMS = sqrt((1/M) * sum((Δr_i / r_i^max)^2)) × 100%
- Final RMS = max(RMS_BH1, RMS_BH2)
Usage: python3 AMSS_NCKU_Verify_ASC26.py [output_dir]
Default: output_dir = GW150914/AMSS_NCKU_output
Reference: GW150914-origin (baseline simulation)
"""
import numpy as np
import sys
import os
# ANSI Color Codes
class Color:
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
BOLD = '\033[1m'
RESET = '\033[0m'
def get_status_text(passed):
if passed:
return f"{Color.GREEN}{Color.BOLD}PASS{Color.RESET}"
else:
return f"{Color.RED}{Color.BOLD}FAIL{Color.RESET}"
def load_bh_trajectory(filepath):
"""Load black hole trajectory data"""
data = np.loadtxt(filepath)
return {
'time': data[:, 0],
'x1': data[:, 1], 'y1': data[:, 2], 'z1': data[:, 3],
'x2': data[:, 4], 'y2': data[:, 5], 'z2': data[:, 6]
}
def load_constraint_data(filepath):
"""Load constraint violation data"""
data = []
with open(filepath, 'r') as f:
for line in f:
if line.startswith('#'):
continue
parts = line.split()
if len(parts) >= 8:
data.append([float(x) for x in parts[:8]])
return np.array(data)
def calculate_rms_error(bh_data_ref, bh_data_target):
"""
Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
This function computes the RMS error independently for BH1 and BH2 trajectories,
then returns the maximum of the two as the final RMS error metric.
For each black hole, the RMS is calculated as:
RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
where:
Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
Args:
bh_data_ref: Reference (baseline) trajectory data
bh_data_target: Target (optimized) trajectory data
Returns:
rms_value: Final RMS error as a percentage (max of BH1 and BH2)
error: Error message if any
"""
# Align data: truncate to the length of the shorter dataset
M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
if M < 10:
return None, "Insufficient data points for comparison"
# Extract XY coordinates for both black holes
x1_ref = bh_data_ref['x1'][:M]
y1_ref = bh_data_ref['y1'][:M]
x2_ref = bh_data_ref['x2'][:M]
y2_ref = bh_data_ref['y2'][:M]
x1_new = bh_data_target['x1'][:M]
y1_new = bh_data_target['y1'][:M]
x2_new = bh_data_target['x2'][:M]
y2_new = bh_data_target['y2'][:M]
# Calculate RMS for BH1
delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
r1_new = np.sqrt(x1_new**2 + y1_new**2)
r1_max = np.maximum(r1_ref, r1_new)
# Calculate RMS for BH2
delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
r2_new = np.sqrt(x2_new**2 + y2_new**2)
r2_max = np.maximum(r2_ref, r2_new)
# Avoid division by zero for BH1
valid_mask1 = r1_max > 1e-15
if np.sum(valid_mask1) < 10:
return None, "Insufficient valid data points for BH1"
terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
rms_bh1 = np.sqrt(np.mean(terms1)) * 100
# Avoid division by zero for BH2
valid_mask2 = r2_max > 1e-15
if np.sum(valid_mask2) < 10:
return None, "Insufficient valid data points for BH2"
terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
rms_bh2 = np.sqrt(np.mean(terms2)) * 100
# Final RMS is the maximum of BH1 and BH2
rms_final = max(rms_bh1, rms_bh2)
return rms_final, None
def analyze_constraint_violation(constraint_data, n_levels=9):
"""
Analyze ADM constraint violation
Return maximum constraint violation for Grid Level 0
"""
# Extract Grid Level 0 data (first entry for each time step)
level0_data = constraint_data[::n_levels]
# Calculate maximum absolute value for each constraint
results = {
'Ham': np.max(np.abs(level0_data[:, 1])),
'Px': np.max(np.abs(level0_data[:, 2])),
'Py': np.max(np.abs(level0_data[:, 3])),
'Pz': np.max(np.abs(level0_data[:, 4])),
'Gx': np.max(np.abs(level0_data[:, 5])),
'Gy': np.max(np.abs(level0_data[:, 6])),
'Gz': np.max(np.abs(level0_data[:, 7]))
}
results['max_violation'] = max(results.values())
return results
def print_header():
"""Print report header"""
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
print(Color.BOLD + " AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
def print_rms_results(rms_rel, error, threshold=1.0):
"""Print RMS error results"""
print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
print("-" * 45)
if error:
print(f" {Color.RED}Error: {error}{Color.RESET}")
return False
passed = rms_rel < threshold
print(f" RMS relative error: {rms_rel:.4f}%")
print(f" Requirement: < {threshold}%")
print(f" Status: {get_status_text(passed)}")
return passed
def print_constraint_results(results, threshold=2.0):
"""Print constraint violation results"""
print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
print("-" * 45)
names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
for i, name in enumerate(names):
print(f" Max |{name:3}|: {results[name]:.6f}", end=" ")
if (i + 1) % 2 == 0: print()
if len(names) % 2 != 0: print()
passed = results['max_violation'] < threshold
print(f"\n Maximum violation: {results['max_violation']:.6f}")
print(f" Requirement: < {threshold}")
print(f" Status: {get_status_text(passed)}")
return passed
def print_summary(rms_passed, constraint_passed):
"""Print summary"""
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
print(Color.BOLD + "Verification Summary" + Color.RESET)
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
all_passed = rms_passed and constraint_passed
res_rms = get_status_text(rms_passed)
res_con = get_status_text(constraint_passed)
print(f" [1] RMS trajectory check: {res_rms}")
print(f" [2] ADM constraint check: {res_con}")
final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
print(f"\n Overall result: {final_status}")
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET + "\n")
return all_passed
def main():
# Determine target (optimized) output directory
if len(sys.argv) > 1:
target_dir = sys.argv[1]
else:
script_dir = os.path.dirname(os.path.abspath(__file__))
target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
# Determine reference (baseline) directory
script_dir = os.path.dirname(os.path.abspath(__file__))
reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
# Data file paths
bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
# Check if files exist
if not os.path.exists(bh_file_ref):
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
sys.exit(1)
if not os.path.exists(bh_file_target):
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
sys.exit(1)
if not os.path.exists(constraint_file):
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
sys.exit(1)
# Print header
print_header()
print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
# Load data
bh_data_ref = load_bh_trajectory(bh_file_ref)
bh_data_target = load_bh_trajectory(bh_file_target)
constraint_data = load_constraint_data(constraint_file)
# Calculate RMS error
rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
rms_passed = print_rms_results(rms_rel, error)
# Analyze constraint violation
constraint_results = analyze_constraint_violation(constraint_data)
constraint_passed = print_constraint_results(constraint_results)
# Print summary
all_passed = print_summary(rms_passed, constraint_passed)
# Return exit code
sys.exit(0 if all_passed else 1)
if __name__ == "__main__":
main()

View File

@@ -37,51 +37,57 @@ close(77)
end program checkFFT end program checkFFT
#endif #endif
!-------------
! Optimized FFT using Intel oneMKL DFTI
! Mathematical equivalence: Standard DFT definition
! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
!------------- !-------------
SUBROUTINE four1(dataa,nn,isign) SUBROUTINE four1(dataa,nn,isign)
use MKL_DFTI
implicit none implicit none
INTEGER, intent(in) :: isign, nn INTEGER::isign,nn
DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa double precision,dimension(2*nn)::dataa
INTEGER::i,istep,j,m,mmax,n
type(DFTI_DESCRIPTOR), pointer :: desc double precision::tempi,tempr
integer :: status DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
n=2*nn
! Create DFTI descriptor for 1D complex-to-complex transform j=1
status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn) do i=1,n,2
if (status /= 0) return if(j.gt.i)then
tempr=dataa(j)
! Set input/output storage as interleaved complex (default) tempi=dataa(j+1)
status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE) dataa(j)=dataa(i)
if (status /= 0) then dataa(j+1)=dataa(i+1)
status = DftiFreeDescriptor(desc) dataa(i)=tempr
return dataa(i+1)=tempi
endif
m=nn
1 if ((m.ge.2).and.(j.gt.m)) then
j=j-m
m=m/2
goto 1
endif
j=j+m
enddo
mmax=2
2 if (n.gt.mmax) then
istep=2*mmax
theta=6.28318530717959d0/(isign*mmax)
wpr=-2.d0*sin(0.5d0*theta)**2
wpi=sin(theta)
wr=1.d0
wi=0.d0
do m=1,mmax,2
do i=m,n,istep
j=i+mmax
tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
dataa(j)=dataa(i)-tempr
dataa(j+1)=dataa(i+1)-tempi
dataa(i)=dataa(i)+tempr
dataa(i+1)=dataa(i+1)+tempi
enddo
wtemp=wr
wr=wr*wpr-wi*wpi+wr
wi=wi*wpr+wtemp*wpi+wi
enddo
mmax=istep
goto 2
endif endif
! Commit the descriptor
status = DftiCommitDescriptor(desc)
if (status /= 0) then
status = DftiFreeDescriptor(desc)
return
endif
! Execute FFT based on direction
if (isign == 1) then
! Forward FFT: exp(-2*pi*i*k*n/N)
status = DftiComputeForward(desc, dataa)
else
! Backward FFT: exp(+2*pi*i*k*n/N)
status = DftiComputeBackward(desc, dataa)
endif
! Free descriptor
status = DftiFreeDescriptor(desc)
return return
END SUBROUTINE four1 END SUBROUTINE four1

View File

@@ -3756,358 +3756,6 @@ void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
delete[] transfer_src; delete[] transfer_src;
delete[] transfer_dst; delete[] transfer_dst;
} }
//
// Async Sync: split into SyncBegin (initiate MPI) and SyncEnd (wait + unpack)
// This allows overlapping MPI communication with computation.
//
static void transfer_begin(Parallel::TransferState *ts)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
int cpusize = ts->cpusize;
ts->reqs = new MPI_Request[2 * cpusize];
ts->stats = new MPI_Status[2 * cpusize];
ts->req_no = 0;
ts->send_data = new double *[cpusize];
ts->rec_data = new double *[cpusize];
int length;
for (int node = 0; node < cpusize; node++)
{
ts->send_data[node] = ts->rec_data[node] = 0;
if (node == myrank)
{
// Local copy: pack then immediately unpack (no MPI needed)
if ((length = Parallel::data_packer(0, ts->transfer_src[myrank], ts->transfer_dst[myrank],
node, PACK, ts->VarList1, ts->VarList2, ts->Symmetry)))
{
double *local_data = new double[length];
if (!local_data)
{
cout << "out of memory in transfer_begin, local copy" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
Parallel::data_packer(local_data, ts->transfer_src[myrank], ts->transfer_dst[myrank],
node, PACK, ts->VarList1, ts->VarList2, ts->Symmetry);
Parallel::data_packer(local_data, ts->transfer_src[node], ts->transfer_dst[node],
node, UNPACK, ts->VarList1, ts->VarList2, ts->Symmetry);
delete[] local_data;
}
}
else
{
// send from this cpu to cpu#node
if ((length = Parallel::data_packer(0, ts->transfer_src[myrank], ts->transfer_dst[myrank],
node, PACK, ts->VarList1, ts->VarList2, ts->Symmetry)))
{
ts->send_data[node] = new double[length];
if (!ts->send_data[node])
{
cout << "out of memory in transfer_begin, send" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
Parallel::data_packer(ts->send_data[node], ts->transfer_src[myrank], ts->transfer_dst[myrank],
node, PACK, ts->VarList1, ts->VarList2, ts->Symmetry);
MPI_Isend((void *)ts->send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD,
ts->reqs + ts->req_no++);
}
// receive from cpu#node to this cpu
if ((length = Parallel::data_packer(0, ts->transfer_src[node], ts->transfer_dst[node],
node, UNPACK, ts->VarList1, ts->VarList2, ts->Symmetry)))
{
ts->rec_data[node] = new double[length];
if (!ts->rec_data[node])
{
cout << "out of memory in transfer_begin, recv" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Irecv((void *)ts->rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD,
ts->reqs + ts->req_no++);
}
}
}
// NOTE: MPI_Waitall is NOT called here - that happens in transfer_end
}
//
static void transfer_end(Parallel::TransferState *ts)
{
// Wait for all pending MPI operations
MPI_Waitall(ts->req_no, ts->reqs, ts->stats);
// Unpack received data from remote ranks
for (int node = 0; node < ts->cpusize; node++)
if (ts->rec_data[node])
Parallel::data_packer(ts->rec_data[node], ts->transfer_src[node], ts->transfer_dst[node],
node, UNPACK, ts->VarList1, ts->VarList2, ts->Symmetry);
// Cleanup MPI buffers
for (int node = 0; node < ts->cpusize; node++)
{
if (ts->send_data[node])
delete[] ts->send_data[node];
if (ts->rec_data[node])
delete[] ts->rec_data[node];
}
delete[] ts->reqs;
delete[] ts->stats;
delete[] ts->send_data;
delete[] ts->rec_data;
}
//
Parallel::SyncHandle *Parallel::SyncBegin(Patch *Pat, MyList<var> *VarList, int Symmetry)
{
int cpusize;
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
SyncHandle *handle = new SyncHandle;
handle->num_states = 1;
handle->states = new TransferState[1];
TransferState *ts = &handle->states[0];
ts->cpusize = cpusize;
ts->VarList1 = VarList;
ts->VarList2 = VarList;
ts->Symmetry = Symmetry;
ts->owns_gsl = true;
ts->dst = build_ghost_gsl(Pat);
ts->src = new MyList<Parallel::gridseg> *[cpusize];
ts->transfer_src = new MyList<Parallel::gridseg> *[cpusize];
ts->transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
for (int node = 0; node < cpusize; node++)
{
ts->src[node] = build_owned_gsl0(Pat, node);
build_gstl(ts->src[node], ts->dst, &ts->transfer_src[node], &ts->transfer_dst[node]);
}
transfer_begin(ts);
return handle;
}
//
Parallel::SyncHandle *Parallel::SyncBegin(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
{
int cpusize;
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
// Count patches
int num_patches = 0;
MyList<Patch> *Pp = PatL;
while (Pp) { num_patches++; Pp = Pp->next; }
SyncHandle *handle = new SyncHandle;
handle->num_states = num_patches + 1; // intra-patch transfers + 1 inter-patch transfer
handle->states = new TransferState[handle->num_states];
// Intra-patch sync: for each patch, build ghost lists and initiate transfer
int idx = 0;
Pp = PatL;
while (Pp)
{
TransferState *ts = &handle->states[idx];
ts->cpusize = cpusize;
ts->VarList1 = VarList;
ts->VarList2 = VarList;
ts->Symmetry = Symmetry;
ts->owns_gsl = true;
ts->dst = build_ghost_gsl(Pp->data);
ts->src = new MyList<Parallel::gridseg> *[cpusize];
ts->transfer_src = new MyList<Parallel::gridseg> *[cpusize];
ts->transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
for (int node = 0; node < cpusize; node++)
{
ts->src[node] = build_owned_gsl0(Pp->data, node);
build_gstl(ts->src[node], ts->dst, &ts->transfer_src[node], &ts->transfer_dst[node]);
}
transfer_begin(ts);
idx++;
Pp = Pp->next;
}
// Inter-patch sync: buffer zone exchange between patches
{
TransferState *ts = &handle->states[idx];
ts->cpusize = cpusize;
ts->VarList1 = VarList;
ts->VarList2 = VarList;
ts->Symmetry = Symmetry;
ts->owns_gsl = true;
ts->dst = build_buffer_gsl(PatL);
ts->src = new MyList<Parallel::gridseg> *[cpusize];
ts->transfer_src = new MyList<Parallel::gridseg> *[cpusize];
ts->transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
for (int node = 0; node < cpusize; node++)
{
ts->src[node] = build_owned_gsl(PatL, node, 5, Symmetry);
build_gstl(ts->src[node], ts->dst, &ts->transfer_src[node], &ts->transfer_dst[node]);
}
transfer_begin(ts);
}
return handle;
}
//
void Parallel::SyncEnd(SyncHandle *handle)
{
if (!handle)
return;
// Wait for all pending transfers and unpack
for (int i = 0; i < handle->num_states; i++)
{
TransferState *ts = &handle->states[i];
transfer_end(ts);
// Cleanup grid segment lists only if this state owns them
if (ts->owns_gsl)
{
if (ts->dst)
ts->dst->destroyList();
for (int node = 0; node < ts->cpusize; node++)
{
if (ts->src[node])
ts->src[node]->destroyList();
if (ts->transfer_src[node])
ts->transfer_src[node]->destroyList();
if (ts->transfer_dst[node])
ts->transfer_dst[node]->destroyList();
}
delete[] ts->src;
delete[] ts->transfer_src;
delete[] ts->transfer_dst;
}
}
delete[] handle->states;
delete handle;
}
//
// SyncPreparePlan: Pre-build grid segment lists for a patch list.
// The plan can be reused across multiple SyncBeginWithPlan calls
// as long as the mesh topology does not change (no regridding).
//
Parallel::SyncPlan *Parallel::SyncPreparePlan(MyList<Patch> *PatL, int Symmetry)
{
int cpusize;
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
// Count patches
int num_patches = 0;
MyList<Patch> *Pp = PatL;
while (Pp) { num_patches++; Pp = Pp->next; }
SyncPlan *plan = new SyncPlan;
plan->num_entries = num_patches + 1; // intra-patch + 1 inter-patch
plan->Symmetry = Symmetry;
plan->entries = new SyncPlanEntry[plan->num_entries];
// Intra-patch entries: ghost zone exchange within each patch
int idx = 0;
Pp = PatL;
while (Pp)
{
SyncPlanEntry *pe = &plan->entries[idx];
pe->cpusize = cpusize;
pe->dst = build_ghost_gsl(Pp->data);
pe->src = new MyList<Parallel::gridseg> *[cpusize];
pe->transfer_src = new MyList<Parallel::gridseg> *[cpusize];
pe->transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
for (int node = 0; node < cpusize; node++)
{
pe->src[node] = build_owned_gsl0(Pp->data, node);
build_gstl(pe->src[node], pe->dst, &pe->transfer_src[node], &pe->transfer_dst[node]);
}
idx++;
Pp = Pp->next;
}
// Inter-patch entry: buffer zone exchange between patches
{
SyncPlanEntry *pe = &plan->entries[idx];
pe->cpusize = cpusize;
pe->dst = build_buffer_gsl(PatL);
pe->src = new MyList<Parallel::gridseg> *[cpusize];
pe->transfer_src = new MyList<Parallel::gridseg> *[cpusize];
pe->transfer_dst = new MyList<Parallel::gridseg> *[cpusize];
for (int node = 0; node < cpusize; node++)
{
pe->src[node] = build_owned_gsl(PatL, node, 5, Symmetry);
build_gstl(pe->src[node], pe->dst, &pe->transfer_src[node], &pe->transfer_dst[node]);
}
}
return plan;
}
//
void Parallel::SyncFreePlan(SyncPlan *plan)
{
if (!plan)
return;
for (int i = 0; i < plan->num_entries; i++)
{
SyncPlanEntry *pe = &plan->entries[i];
if (pe->dst)
pe->dst->destroyList();
for (int node = 0; node < pe->cpusize; node++)
{
if (pe->src[node])
pe->src[node]->destroyList();
if (pe->transfer_src[node])
pe->transfer_src[node]->destroyList();
if (pe->transfer_dst[node])
pe->transfer_dst[node]->destroyList();
}
delete[] pe->src;
delete[] pe->transfer_src;
delete[] pe->transfer_dst;
}
delete[] plan->entries;
delete plan;
}
//
// SyncBeginWithPlan: Use pre-built GSLs from a SyncPlan to initiate async transfer.
// This avoids the O(cpusize * blocks^2) cost of rebuilding GSLs on every call.
//
Parallel::SyncHandle *Parallel::SyncBeginWithPlan(SyncPlan *plan, MyList<var> *VarList)
{
return SyncBeginWithPlan(plan, VarList, VarList);
}
//
Parallel::SyncHandle *Parallel::SyncBeginWithPlan(SyncPlan *plan, MyList<var> *VarList1, MyList<var> *VarList2)
{
SyncHandle *handle = new SyncHandle;
handle->num_states = plan->num_entries;
handle->states = new TransferState[handle->num_states];
for (int i = 0; i < plan->num_entries; i++)
{
SyncPlanEntry *pe = &plan->entries[i];
TransferState *ts = &handle->states[i];
ts->cpusize = pe->cpusize;
ts->VarList1 = VarList1;
ts->VarList2 = VarList2;
ts->Symmetry = plan->Symmetry;
ts->owns_gsl = false; // GSLs are owned by the plan, not this handle
// Borrow GSL pointers from the plan (do NOT free them in SyncEnd)
ts->transfer_src = pe->transfer_src;
ts->transfer_dst = pe->transfer_dst;
ts->src = pe->src;
ts->dst = pe->dst;
transfer_begin(ts);
}
return handle;
}
// collect buffer grid segments or blocks for the periodic boundary condition of given patch // collect buffer grid segments or blocks for the periodic boundary condition of given patch
// --------------------------------------------------- // ---------------------------------------------------
// |con | |con | // |con | |con |

View File

@@ -81,53 +81,6 @@ namespace Parallel
int Symmetry); int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry); void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry); void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
// Async Sync: overlap MPI communication with computation
struct TransferState
{
MPI_Request *reqs;
MPI_Status *stats;
int req_no;
double **send_data;
double **rec_data;
int cpusize;
MyList<gridseg> **transfer_src;
MyList<gridseg> **transfer_dst;
MyList<gridseg> **src;
MyList<gridseg> *dst;
MyList<var> *VarList1;
MyList<var> *VarList2;
int Symmetry;
bool owns_gsl; // true if this state owns and should free the GSLs
};
struct SyncHandle
{
TransferState *states;
int num_states;
};
SyncHandle *SyncBegin(Patch *Pat, MyList<var> *VarList, int Symmetry);
SyncHandle *SyncBegin(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void SyncEnd(SyncHandle *handle);
// Cached GSL plan: pre-build grid segment lists once, reuse across multiple Sync calls
struct SyncPlanEntry
{
int cpusize;
MyList<gridseg> **transfer_src;
MyList<gridseg> **transfer_dst;
MyList<gridseg> **src;
MyList<gridseg> *dst;
};
struct SyncPlan
{
SyncPlanEntry *entries;
int num_entries;
int Symmetry;
};
SyncPlan *SyncPreparePlan(MyList<Patch> *PatL, int Symmetry);
void SyncFreePlan(SyncPlan *plan);
SyncHandle *SyncBeginWithPlan(SyncPlan *plan, MyList<var> *VarList);
SyncHandle *SyncBeginWithPlan(SyncPlan *plan, MyList<var> *VarList1, MyList<var> *VarList2);
void OutBdLow2Hi(Patch *Patc, Patch *Patf, void OutBdLow2Hi(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry); int Symmetry);

File diff suppressed because it is too large Load Diff

View File

@@ -1,8 +1,7 @@
#ifndef TWO_PUNCTURES_H #ifndef TWO_PUNCTURES_H
#define TWO_PUNCTURES_H #define TWO_PUNCTURES_H
#include <omp.h>
#define StencilSize 19 #define StencilSize 19
#define N_PlaneRelax 1 #define N_PlaneRelax 1
#define NRELAX 200 #define NRELAX 200
@@ -43,18 +42,6 @@ private:
int ntotal; int ntotal;
// ===== Precomputed spectral derivative matrices =====
double *D1_A, *D2_A;
double *D1_B, *D2_B;
double *DF1_phi, *DF2_phi;
// ===== Pre-allocated workspace for LineRelax (per-thread) =====
int max_threads;
double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be;
double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be;
double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al;
double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al;
struct parameters struct parameters
{ {
int nvar, n1, n2, n3; int nvar, n1, n2, n3;
@@ -71,28 +58,6 @@ public:
int Newtonmaxit); int Newtonmaxit);
~TwoPunctures(); ~TwoPunctures();
// 02/07: New/modified methods
void allocate_workspace();
void free_workspace();
void precompute_derivative_matrices();
void build_cheb_deriv_matrices(int n, double *D1, double *D2);
void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
double *l, double *u_ws, double *d, double *y);
void LineRelax_be_omp(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD, int tid);
void LineRelax_al_omp(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD, int tid);
void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD);
void Solve(); void Solve();
void set_initial_guess(derivs v); void set_initial_guess(derivs v);
int index(int i, int j, int k, int l, int a, int b, int c, int d); int index(int i, int j, int k, int l, int a, int b, int c, int d);
@@ -151,11 +116,23 @@ public:
double BY_KKofxyz(double x, double y, double z); double BY_KKofxyz(double x, double y, double z);
void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix); void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u); void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD);
void LineRelax_be(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD);
void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2, void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
int n3, derivs dv, derivs u, double *values); int n3, derivs dv, derivs u, double *values);
void LinEquations(double A, double B, double X, double R, void LinEquations(double A, double B, double X, double R,
double x, double r, double phi, double x, double r, double phi,
double y, double z, derivs dU, derivs U, double *values); double y, double z, derivs dU, derivs U, double *values);
void LineRelax_al(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD);
void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q); void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
void Save(char *fname); void Save(char *fname);
// provided by Vasileios Paschalidis (vpaschal@illinois.edu) // provided by Vasileios Paschalidis (vpaschal@illinois.edu)

View File

@@ -186,12 +186,6 @@ void Z4c_class::Step(int lev, int YN)
int ERROR = 0; int ERROR = 0;
MyList<ss_patch> *sPp; MyList<ss_patch> *sPp;
// Pre-build grid segment lists once for this level's patches.
// These are reused across predictor + 3 corrector SyncBegin calls,
// avoiding O(cpusize * blocks^2) rebuild each time.
Parallel::SyncPlan *sync_plan = Parallel::SyncPreparePlan(GH->PatL[lev], Symmetry);
// Predictor // Predictor
MyList<Patch> *Pp = GH->PatL[lev]; MyList<Patch> *Pp = GH->PatL[lev];
while (Pp) while (Pp)
@@ -327,17 +321,13 @@ void Z4c_class::Step(int lev, int YN)
} }
Pp = Pp->next; Pp = Pp->next;
} }
// Start async ghost zone exchange - overlaps with error check and Shell computation // check error information
Parallel::SyncHandle *sync_pre = Parallel::SyncBeginWithPlan(sync_plan, SynchList_pre);
// check error information (overlaps with MPI transfer)
{ {
int erh = ERROR; int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
} }
if (ERROR) if (ERROR)
{ {
Parallel::SyncEnd(sync_pre); sync_pre = 0;
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -485,7 +475,6 @@ void Z4c_class::Step(int lev, int YN)
} }
if (ERROR) if (ERROR)
{ {
Parallel::SyncEnd(sync_pre); sync_pre = 0;
SH->Dump_Data(StateList, 0, PhysTime, dT_lev); SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -496,8 +485,7 @@ void Z4c_class::Step(int lev, int YN)
} }
#endif #endif
// Complete async ghost zone exchange Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
if (sync_pre) Parallel::SyncEnd(sync_pre);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -705,17 +693,13 @@ void Z4c_class::Step(int lev, int YN)
Pp = Pp->next; Pp = Pp->next;
} }
// Start async ghost zone exchange - overlaps with error check and Shell computation // check error information
Parallel::SyncHandle *sync_cor = Parallel::SyncBeginWithPlan(sync_plan, SynchList_cor);
// check error information (overlaps with MPI transfer)
{ {
int erh = ERROR; int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
} }
if (ERROR) if (ERROR)
{ {
Parallel::SyncEnd(sync_cor); sync_cor = 0;
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -873,7 +857,6 @@ void Z4c_class::Step(int lev, int YN)
} }
if (ERROR) if (ERROR)
{ {
Parallel::SyncEnd(sync_cor); sync_cor = 0;
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -885,8 +868,7 @@ void Z4c_class::Step(int lev, int YN)
} }
#endif #endif
// Complete async ghost zone exchange Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
if (sync_cor) Parallel::SyncEnd(sync_cor);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -1060,8 +1042,6 @@ void Z4c_class::Step(int lev, int YN)
Porg0[ithBH][2] = Porg1[ithBH][2]; Porg0[ithBH][2] = Porg1[ithBH][2];
} }
} }
Parallel::SyncFreePlan(sync_plan);
} }
#else #else
// for constraint preserving boundary (CPBC) // for constraint preserving boundary (CPBC)
@@ -1095,10 +1075,6 @@ void Z4c_class::Step(int lev, int YN)
int ERROR = 0; int ERROR = 0;
MyList<ss_patch> *sPp; MyList<ss_patch> *sPp;
// Pre-build grid segment lists once for this level's patches.
Parallel::SyncPlan *sync_plan = Parallel::SyncPreparePlan(GH->PatL[lev], Symmetry);
// Predictor // Predictor
MyList<Patch> *Pp = GH->PatL[lev]; MyList<Patch> *Pp = GH->PatL[lev];
while (Pp) while (Pp)
@@ -1566,17 +1542,13 @@ void Z4c_class::Step(int lev, int YN)
} }
#endif #endif
} }
// Start async ghost zone exchange - overlaps with error check // check error information
Parallel::SyncHandle *sync_pre = Parallel::SyncBeginWithPlan(sync_plan, SynchList_pre);
// check error information (overlaps with MPI transfer)
{ {
int erh = ERROR; int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
} }
if (ERROR) if (ERROR)
{ {
Parallel::SyncEnd(sync_pre); sync_pre = 0;
SH->Dump_Data(StateList, 0, PhysTime, dT_lev); SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -1586,8 +1558,7 @@ void Z4c_class::Step(int lev, int YN)
} }
} }
// Complete async ghost zone exchange Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
if (sync_pre) Parallel::SyncEnd(sync_pre);
if (lev == 0) if (lev == 0)
{ {
@@ -2132,17 +2103,13 @@ void Z4c_class::Step(int lev, int YN)
sPp = sPp->next; sPp = sPp->next;
} }
} }
// Start async ghost zone exchange - overlaps with error check // check error information
Parallel::SyncHandle *sync_cor = Parallel::SyncBeginWithPlan(sync_plan, SynchList_cor);
// check error information (overlaps with MPI transfer)
{ {
int erh = ERROR; int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
} }
if (ERROR) if (ERROR)
{ {
Parallel::SyncEnd(sync_cor); sync_cor = 0;
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -2153,8 +2120,7 @@ void Z4c_class::Step(int lev, int YN)
} }
} }
// Complete async ghost zone exchange Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
if (sync_cor) Parallel::SyncEnd(sync_cor);
if (lev == 0) if (lev == 0)
{ {
@@ -2380,8 +2346,6 @@ void Z4c_class::Step(int lev, int YN)
DG_List->clearList(); DG_List->clearList();
} }
#endif #endif
Parallel::SyncFreePlan(sync_plan);
} }
#endif #endif
#undef MRBD #undef MRBD

View File

@@ -3035,12 +3035,6 @@ void bssn_class::Step(int lev, int YN)
int ERROR = 0; int ERROR = 0;
MyList<ss_patch> *sPp; MyList<ss_patch> *sPp;
// Pre-build grid segment lists once for this level's patches.
// These are reused across predictor + 3 corrector SyncBegin calls,
// avoiding O(cpusize * blocks^2) rebuild each time.
Parallel::SyncPlan *sync_plan = Parallel::SyncPreparePlan(GH->PatL[lev], Symmetry);
// Predictor // Predictor
MyList<Patch> *Pp = GH->PatL[lev]; MyList<Patch> *Pp = GH->PatL[lev];
while (Pp) while (Pp)
@@ -3164,18 +3158,13 @@ void bssn_class::Step(int lev, int YN)
} }
Pp = Pp->next; Pp = Pp->next;
} }
// check error information
// Start async ghost zone exchange - overlaps with error check and Shell computation
Parallel::SyncHandle *sync_pre = Parallel::SyncBeginWithPlan(sync_plan, SynchList_pre);
// check error information (overlaps with MPI transfer)
{ {
int erh = ERROR; int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
} }
if (ERROR) if (ERROR)
{ {
Parallel::SyncEnd(sync_pre); sync_pre = 0;
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -3335,7 +3324,6 @@ void bssn_class::Step(int lev, int YN)
if (ERROR) if (ERROR)
{ {
Parallel::SyncEnd(sync_pre); sync_pre = 0;
SH->Dump_Data(StateList, 0, PhysTime, dT_lev); SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -3346,8 +3334,7 @@ void bssn_class::Step(int lev, int YN)
} }
#endif #endif
// Complete async ghost zone exchange Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
if (sync_pre) Parallel::SyncEnd(sync_pre);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -3541,10 +3528,7 @@ void bssn_class::Step(int lev, int YN)
Pp = Pp->next; Pp = Pp->next;
} }
// Start async ghost zone exchange - overlaps with error check and Shell computation // check error information
Parallel::SyncHandle *sync_cor = Parallel::SyncBeginWithPlan(sync_plan, SynchList_cor);
// check error information (overlaps with MPI transfer)
{ {
int erh = ERROR; int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
@@ -3552,7 +3536,6 @@ void bssn_class::Step(int lev, int YN)
if (ERROR) if (ERROR)
{ {
Parallel::SyncEnd(sync_cor); sync_cor = 0;
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -3709,7 +3692,6 @@ void bssn_class::Step(int lev, int YN)
} }
if (ERROR) if (ERROR)
{ {
Parallel::SyncEnd(sync_cor); sync_cor = 0;
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -3722,8 +3704,7 @@ void bssn_class::Step(int lev, int YN)
} }
#endif #endif
// Complete async ghost zone exchange Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
if (sync_cor) Parallel::SyncEnd(sync_cor);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -3914,8 +3895,6 @@ void bssn_class::Step(int lev, int YN)
Porg0[ithBH][2] = Porg1[ithBH][2]; Porg0[ithBH][2] = Porg1[ithBH][2];
} }
} }
Parallel::SyncFreePlan(sync_plan);
} }
//================================================================================================ //================================================================================================
@@ -4838,12 +4817,6 @@ void bssn_class::Step(int lev, int YN)
int ERROR = 0; int ERROR = 0;
MyList<ss_patch> *sPp; MyList<ss_patch> *sPp;
// Pre-build grid segment lists once for this level's patches.
// These are reused across predictor + 3 corrector SyncBegin calls,
// avoiding O(cpusize * blocks^2) rebuild each time.
Parallel::SyncPlan *sync_plan = Parallel::SyncPreparePlan(GH->PatL[lev], Symmetry);
// Predictor // Predictor
MyList<Patch> *Pp = GH->PatL[lev]; MyList<Patch> *Pp = GH->PatL[lev];
while (Pp) while (Pp)
@@ -4970,17 +4943,13 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation");
// Start async ghost zone exchange - overlaps with error check and BH position // check error information
Parallel::SyncHandle *sync_pre = Parallel::SyncBeginWithPlan(sync_plan, SynchList_pre);
// check error information (overlaps with MPI transfer)
{ {
int erh = ERROR; int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
} }
if (ERROR) if (ERROR)
{ {
Parallel::SyncEnd(sync_pre); sync_pre = 0;
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -4992,8 +4961,7 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
// Complete async ghost zone exchange Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
if (sync_pre) Parallel::SyncEnd(sync_pre);
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
@@ -5172,17 +5140,13 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check");
// Start async ghost zone exchange - overlaps with error check and BH position // check error information
Parallel::SyncHandle *sync_cor = Parallel::SyncBeginWithPlan(sync_plan, SynchList_cor);
// check error information (overlaps with MPI transfer)
{ {
int erh = ERROR; int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
} }
if (ERROR) if (ERROR)
{ {
Parallel::SyncEnd(sync_cor); sync_cor = 0;
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -5196,8 +5160,7 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
// Complete async ghost zone exchange Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
if (sync_cor) Parallel::SyncEnd(sync_cor);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
@@ -5313,8 +5276,6 @@ void bssn_class::Step(int lev, int YN)
// if(myrank==GH->start_rank[lev]) cout<<GH->mylev<<endl; // if(myrank==GH->start_rank[lev]) cout<<GH->mylev<<endl;
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"complet GH Step"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"complet GH Step");
Parallel::SyncFreePlan(sync_plan);
} }
//================================================================================================ //================================================================================================

View File

@@ -106,8 +106,7 @@
call getpbh(BHN,Porg,Mass) call getpbh(BHN,Porg,Mass)
#endif #endif
!!! sanity check (disabled in production builds for performance) !!! sanity check
#ifdef DEBUG
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) & dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) & +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
+sum(Gamx)+sum(Gamy)+sum(Gamz) & +sum(Gamx)+sum(Gamy)+sum(Gamz) &
@@ -137,7 +136,6 @@
gont = 1 gont = 1
return return
endif endif
#endif
PI = dacos(-ONE) PI = dacos(-ONE)
@@ -945,60 +943,103 @@
SSA(2)=SYM SSA(2)=SYM
SSA(3)=ANTI SSA(3)=ANTI
!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency) !!!!!!!!!advection term part
! lopsided_kodis shares the symmetry_bd buffer between advection and
! dissipation, eliminating redundant full-grid copies. For metric variables
! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
! so the constant offset has no effect on dissipation.
call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS)
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps) call lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA)
call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps) call lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA)
call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS)
call lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps) call lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA)
call lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps) call lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA)
call lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps) call lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps) call lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps) call lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA)
!!
#if 1
!! bam does not apply dissipation on gauge variables
call lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps)
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#endif
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#endif
#else
! No dissipation on gauge variables (advection only)
call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS) call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS)
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7) #if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS) call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS) call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA) call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA)
#endif #endif
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7) #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS) call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS) call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA) call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA)
#endif #endif
if(eps>0)then
! usual Kreiss-Oliger dissipation
call kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps)
call kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps)
call kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps)
call kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps)
#if 0
#define i 42
#define j 40
#define k 40
if(Lev == 1)then
write(*,*) X(i),Y(j),Z(k)
write(*,*) "before",Axx_rhs(i,j,k)
endif
#undef i
#undef j
#undef k
!!stop
#endif #endif
call kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps)
#if 0
#define i 42
#define j 40
#define k 40
if(Lev == 1)then
write(*,*) X(i),Y(j),Z(k)
write(*,*) "after",Axx_rhs(i,j,k)
endif
#undef i
#undef j
#undef k
!!stop
#endif
call kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps)
call kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps)
call kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps)
call kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps)
#if 1
!! bam does not apply dissipation on gauge variables
call kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps)
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps)
#endif
#endif
endif
if(co == 0)then if(co == 0)then
! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho ! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho

View File

@@ -19,60 +19,48 @@
!~~~~~~~> Local variable: !~~~~~~~> Local variable:
integer :: i,j,k real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg
real*8 :: lgxx,lgyy,lgzz,ldetg real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz
real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
real*8 :: ltrA,lscale
real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
!~~~~~~> !~~~~~~>
do k=1,ex(3) gxx = dxx + ONE
do j=1,ex(2) gyy = dyy + ONE
do i=1,ex(1) gzz = dzz + ONE
lgxx = dxx(i,j,k) + ONE detg = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
lgyy = dyy(i,j,k) + ONE gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
lgzz = dzz(i,j,k) + ONE gupxx = ( gyy * gzz - gyz * gyz ) / detg
gupxy = - ( gxy * gzz - gyz * gxz ) / detg
gupxz = ( gxy * gyz - gyy * gxz ) / detg
gupyy = ( gxx * gzz - gxz * gxz ) / detg
gupyz = - ( gxx * gyz - gxy * gxz ) / detg
gupzz = ( gxx * gyy - gxy * gxy ) / detg
ldetg = lgxx * lgyy * lgzz & trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz &
+ gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) & + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
+ gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
- gxz(i,j,k) * lgyy * gxz(i,j,k) &
- gxy(i,j,k) * gxy(i,j,k) * lgzz &
- lgxx * gyz(i,j,k) * gyz(i,j,k)
lgupxx = ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg Axx = Axx - F1o3 * gxx * trA
lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg Axy = Axy - F1o3 * gxy * trA
lgupxz = ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg Axz = Axz - F1o3 * gxz * trA
lgupyy = ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg Ayy = Ayy - F1o3 * gyy * trA
lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg Ayz = Ayz - F1o3 * gyz * trA
lgupzz = ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg Azz = Azz - F1o3 * gzz * trA
ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) & detg = ONE / ( detg ** F1o3 )
+ lgupzz * Azz(i,j,k) &
+ TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
+ lgupyz * Ayz(i,j,k))
Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA gxx = gxx * detg
Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA gxy = gxy * detg
Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA gxz = gxz * detg
Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA gyy = gyy * detg
Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA gyz = gyz * detg
Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA gzz = gzz * detg
lscale = ONE / ( ldetg ** F1o3 ) dxx = gxx - ONE
dyy = gyy - ONE
dxx(i,j,k) = lgxx * lscale - ONE dzz = gzz - ONE
gxy(i,j,k) = gxy(i,j,k) * lscale
gxz(i,j,k) = gxz(i,j,k) * lscale
dyy(i,j,k) = lgyy * lscale - ONE
gyz(i,j,k) = gyz(i,j,k) * lscale
dzz(i,j,k) = lgzz * lscale - ONE
enddo
enddo
enddo
return return
@@ -95,70 +83,50 @@
!~~~~~~~> Local variable: !~~~~~~~> Local variable:
integer :: i,j,k real*8, dimension(ex(1),ex(2),ex(3)) :: trA
real*8 :: lgxx,lgyy,lgzz,lscale real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz
real*8 :: lgxy,lgxz,lgyz real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
real*8 :: ltrA
real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
!~~~~~~> !~~~~~~>
do k=1,ex(3) gxx = dxx + ONE
do j=1,ex(2) gyy = dyy + ONE
do i=1,ex(1) gzz = dzz + ONE
! for g
gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
! for g: normalize determinant first gupzz = ONE / ( gupzz ** F1o3 )
lgxx = dxx(i,j,k) + ONE
lgyy = dyy(i,j,k) + ONE
lgzz = dzz(i,j,k) + ONE
lgxy = gxy(i,j,k)
lgxz = gxz(i,j,k)
lgyz = gyz(i,j,k)
lscale = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz & gxx = gxx * gupzz
+ lgxz * lgxy * lgyz - lgxz * lgyy * lgxz & gxy = gxy * gupzz
- lgxy * lgxy * lgzz - lgxx * lgyz * lgyz gxz = gxz * gupzz
gyy = gyy * gupzz
gyz = gyz * gupzz
gzz = gzz * gupzz
lscale = ONE / ( lscale ** F1o3 ) dxx = gxx - ONE
dyy = gyy - ONE
dzz = gzz - ONE
! for A
lgxx = lgxx * lscale gupxx = ( gyy * gzz - gyz * gyz )
lgxy = lgxy * lscale gupxy = - ( gxy * gzz - gyz * gxz )
lgxz = lgxz * lscale gupxz = ( gxy * gyz - gyy * gxz )
lgyy = lgyy * lscale gupyy = ( gxx * gzz - gxz * gxz )
lgyz = lgyz * lscale gupyz = - ( gxx * gyz - gxy * gxz )
lgzz = lgzz * lscale gupzz = ( gxx * gyy - gxy * gxy )
dxx(i,j,k) = lgxx - ONE trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz &
gxy(i,j,k) = lgxy + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
gxz(i,j,k) = lgxz
dyy(i,j,k) = lgyy - ONE
gyz(i,j,k) = lgyz
dzz(i,j,k) = lgzz - ONE
! for A: trace-free using normalized metric (det=1, no division needed) Axx = Axx - F1o3 * gxx * trA
lgupxx = ( lgyy * lgzz - lgyz * lgyz ) Axy = Axy - F1o3 * gxy * trA
lgupxy = - ( lgxy * lgzz - lgyz * lgxz ) Axz = Axz - F1o3 * gxz * trA
lgupxz = ( lgxy * lgyz - lgyy * lgxz ) Ayy = Ayy - F1o3 * gyy * trA
lgupyy = ( lgxx * lgzz - lgxz * lgxz ) Ayz = Ayz - F1o3 * gyz * trA
lgupyz = - ( lgxx * lgyz - lgxy * lgxz ) Azz = Azz - F1o3 * gzz * trA
lgupzz = ( lgxx * lgyy - lgxy * lgxy )
ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+ lgupzz * Azz(i,j,k) &
+ TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
+ lgupyz * Ayz(i,j,k))
Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
enddo
enddo
enddo
return return

View File

@@ -324,6 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -349,6 +350,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -377,6 +379,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -883,6 +886,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -908,6 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -936,6 +941,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -1112,65 +1118,64 @@ end subroutine d2dump
! Lagrangian polynomial interpolation ! Lagrangian polynomial interpolation
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
subroutine polint(xa, ya, x, y, dy, ordn) subroutine polint(xa,ya,x,y,dy,ordn)
implicit none implicit none
integer, intent(in) :: ordn !~~~~~~> Input Parameter:
real*8, dimension(ordn), intent(in) :: xa, ya integer,intent(in) :: ordn
real*8, dimension(ordn), intent(in) :: xa,ya
real*8, intent(in) :: x real*8, intent(in) :: x
real*8, intent(out) :: y, dy real*8, intent(out) :: y,dy
integer :: i, m, ns, n_m !~~~~~~> Other parameter:
real*8, dimension(ordn) :: c, d, ho
real*8 :: dif, dift, hp, h, den_val
c = ya integer :: m,n,ns
d = ya real*8, dimension(ordn) :: c,d,den,ho
ho = xa - x real*8 :: dif,dift
ns = 1 !~~~~~~>
dif = abs(x - xa(1))
do i = 2, ordn n=ordn
dift = abs(x - xa(i)) m=ordn
if (dift < dif) then
ns = i c=ya
dif = dift d=ya
end if ho=xa-x
ns=1
dif=abs(x-xa(1))
do m=1,n
dift=abs(x-xa(m))
if(dift < dif) then
ns=m
dif=dift
end if
end do end do
y = ya(ns) y=ya(ns)
ns = ns - 1 ns=ns-1
do m=1,n-1
do m = 1, ordn - 1 den(1:n-m)=ho(1:n-m)-ho(1+m:n)
n_m = ordn - m if (any(den(1:n-m) == 0.0))then
do i = 1, n_m write(*,*) 'failure in polint for point',x
hp = ho(i) write(*,*) 'with input points: ',xa
h = ho(i+m) stop
den_val = hp - h endif
den(1:n-m)=(c(2:n-m+1)-d(1:n-m))/den(1:n-m)
if (den_val == 0.0d0) then d(1:n-m)=ho(1+m:n)*den(1:n-m)
write(*,*) 'failure in polint for point',x c(1:n-m)=ho(1:n-m)*den(1:n-m)
write(*,*) 'with input points: ',xa if (2*ns < n-m) then
stop dy=c(ns+1)
end if
den_val = (c(i+1) - d(i)) / den_val
d(i) = h * den_val
c(i) = hp * den_val
end do
if (2 * ns < n_m) then
dy = c(ns + 1)
else else
dy = d(ns) dy=d(ns)
ns = ns - 1 ns=ns-1
end if end if
y = y + dy y=y+dy
end do end do
return return
end subroutine polint end subroutine polint
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
! !
@@ -1178,37 +1183,35 @@ end subroutine d2dump
! !
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn) subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
implicit none implicit none
!~~~~~~> Input parameters:
integer,intent(in) :: ordn integer,intent(in) :: ordn
real*8, dimension(1:ordn), intent(in) :: x1a,x2a real*8, dimension(1:ordn), intent(in) :: x1a,x2a
real*8, dimension(1:ordn,1:ordn), intent(in) :: ya real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
real*8, intent(in) :: x1,x2 real*8, intent(in) :: x1,x2
real*8, intent(out) :: y,dy real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER !~~~~~~> Other parameters:
integer :: i,m integer :: i,m
real*8, dimension(ordn) :: ymtmp real*8, dimension(ordn) :: ymtmp
real*8, dimension(ordn) :: yntmp real*8, dimension(ordn) :: yntmp
m=size(x1a) m=size(x1a)
do i=1,m do i=1,m
yntmp=ya(i,:) yntmp=ya(i,:)
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
end do
call polint(x1a,ymtmp,x1,y,dy,ordn)
#else
integer :: j
real*8, dimension(ordn) :: ymtmp
real*8 :: dy_temp
do j=1,ordn
call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
end do end do
call polint(x2a, ymtmp, x2, y, dy, ordn)
#endif call polint(x1a,ymtmp,x1,y,dy,ordn)
return return
end subroutine polin2 end subroutine polin2
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
! !
@@ -1216,15 +1219,18 @@ end subroutine d2dump
! !
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn) subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
implicit none implicit none
!~~~~~~> Input parameters:
integer,intent(in) :: ordn integer,intent(in) :: ordn
real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
real*8, intent(in) :: x1,x2,x3 real*8, intent(in) :: x1,x2,x3
real*8, intent(out) :: y,dy real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER !~~~~~~> Other parameters:
integer :: i,j,m,n integer :: i,j,m,n
real*8, dimension(ordn,ordn) :: yatmp real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(ordn) :: ymtmp real*8, dimension(ordn) :: ymtmp
@@ -1233,33 +1239,24 @@ end subroutine d2dump
m=size(x1a) m=size(x1a)
n=size(x2a) n=size(x2a)
do i=1,m do i=1,m
do j=1,n do j=1,n
yqtmp=ya(i,j,:) yqtmp=ya(i,j,:)
call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn) call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
end do end do
yntmp=yatmp(i,:) yntmp=yatmp(i,:)
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
end do
call polint(x1a,ymtmp,x1,y,dy,ordn)
#else
integer :: j, k
real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(ordn) :: ymtmp
real*8 :: dy_temp
do k=1,ordn
do j=1,ordn
call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
end do
end do end do
do k=1,ordn
call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn) call polint(x1a,ymtmp,x1,y,dy,ordn)
end do
call polint(x3a, ymtmp, x3, y, dy, ordn)
#endif
return return
end subroutine polin3 end subroutine polin3
!-------------------------------------------------------------------------------------- !--------------------------------------------------------------------------------------
! calculate L2norm ! calculate L2norm
@@ -1279,9 +1276,7 @@ end subroutine d2dump
real*8 :: dX, dY, dZ real*8 :: dX, dY, dZ
integer::imin,jmin,kmin integer::imin,jmin,kmin
integer::imax,jmax,kmax integer::imax,jmax,kmax
integer::i,j,k,n_elements integer::i,j,k
real*8, dimension(:), allocatable :: f_flat
real*8, external :: DDOT
dX = X(2) - X(1) dX = X(2) - X(1)
dY = Y(2) - Y(1) dY = Y(2) - Y(1)
@@ -1305,12 +1300,7 @@ if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1 if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1 if(dabs(Z(1)-zmin) < dZ) kmin = 1
! Optimized with oneMKL BLAS DDOT for dot product f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat)
f_out = f_out*dX*dY*dZ f_out = f_out*dX*dY*dZ
@@ -1335,9 +1325,7 @@ f_out = f_out*dX*dY*dZ
real*8 :: dX, dY, dZ real*8 :: dX, dY, dZ
integer::imin,jmin,kmin integer::imin,jmin,kmin
integer::imax,jmax,kmax integer::imax,jmax,kmax
integer::i,j,k,n_elements integer::i,j,k
real*8, dimension(:), allocatable :: f_flat
real*8, external :: DDOT
real*8 :: PIo4 real*8 :: PIo4
@@ -1400,12 +1388,7 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1 if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif endif
! Optimized with oneMKL BLAS DDOT for dot product f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat)
f_out = f_out*dX*dY*dZ f_out = f_out*dX*dY*dZ
@@ -1433,8 +1416,6 @@ f_out = f_out*dX*dY*dZ
integer::imin,jmin,kmin integer::imin,jmin,kmin
integer::imax,jmax,kmax integer::imax,jmax,kmax
integer::i,j,k integer::i,j,k
real*8, dimension(:), allocatable :: f_flat
real*8, external :: DDOT
real*8 :: PIo4 real*8 :: PIo4
@@ -1497,12 +1478,11 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1 if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif endif
! Optimized with oneMKL BLAS DDOT for dot product f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
f_out = f_out
Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(Nout))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
deallocate(f_flat)
return return
@@ -1700,7 +1680,6 @@ deallocate(f_flat)
real*8, dimension(ORDN,ORDN) :: tmp2 real*8, dimension(ORDN,ORDN) :: tmp2
real*8, dimension(ORDN) :: tmp1 real*8, dimension(ORDN) :: tmp1
real*8, dimension(3) :: SoAh real*8, dimension(3) :: SoAh
real*8, external :: DDOT
! +1 because c++ gives 0 for first point ! +1 because c++ gives 0 for first point
cxB = inds+1 cxB = inds+1
@@ -1736,21 +1715,20 @@ deallocate(f_flat)
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3)) ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
endif endif
! Optimized with BLAS operations for better performance
! First dimension: z-direction weighted sum
tmp2=0 tmp2=0
do m=1,ORDN do m=1,ORDN
tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m) tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
enddo enddo
! Second dimension: y-direction weighted sum
tmp1=0 tmp1=0
do m=1,ORDN do m=1,ORDN
tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m) tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
enddo enddo
! Third dimension: x-direction weighted sum using BLAS DDOT f_int=0
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1) do m=1,ORDN
f_int = f_int + coef(m)*tmp1(m)
enddo
return return
@@ -1780,7 +1758,6 @@ deallocate(f_flat)
real*8, dimension(ORDN,ORDN) :: ya real*8, dimension(ORDN,ORDN) :: ya
real*8, dimension(ORDN) :: tmp1 real*8, dimension(ORDN) :: tmp1
real*8, dimension(2) :: SoAh real*8, dimension(2) :: SoAh
real*8, external :: DDOT
! +1 because c++ gives 0 for first point ! +1 because c++ gives 0 for first point
cxB = inds(1:2)+1 cxB = inds(1:2)+1
@@ -1810,14 +1787,15 @@ deallocate(f_flat)
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3)) ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
endif endif
! Optimized with BLAS operations
tmp1=0 tmp1=0
do m=1,ORDN do m=1,ORDN
tmp1 = tmp1 + coef(ORDN+m)*ya(:,m) tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
enddo enddo
! Use BLAS DDOT for final weighted sum f_int=0
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1) do m=1,ORDN
f_int = f_int + coef(m)*tmp1(m)
enddo
return return
@@ -1848,7 +1826,6 @@ deallocate(f_flat)
real*8, dimension(ORDN) :: ya real*8, dimension(ORDN) :: ya
real*8 :: SoAh real*8 :: SoAh
integer,dimension(3) :: inds integer,dimension(3) :: inds
real*8, external :: DDOT
! +1 because c++ gives 0 for first point ! +1 because c++ gives 0 for first point
inds = indsi + 1 inds = indsi + 1
@@ -1909,8 +1886,10 @@ deallocate(f_flat)
write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
endif endif
! Optimized with BLAS DDOT for weighted sum f_int=0
f_int = DDOT(ORDN, coef, 1, ya, 1) do m=1,ORDN
f_int = f_int + coef(m)*ya(m)
enddo
return return
@@ -2142,38 +2121,24 @@ deallocate(f_flat)
end function fWigner_d_function end function fWigner_d_function
!---------------------------------- !----------------------------------
! Optimized factorial function using lookup table for small N
! and log-gamma for large N to avoid overflow
function ffact(N) result(gont) function ffact(N) result(gont)
implicit none implicit none
integer,intent(in) :: N integer,intent(in) :: N
real*8 :: gont real*8 :: gont
integer :: i
! Lookup table for factorials 0! to 20! (precomputed) integer :: i
real*8, parameter, dimension(0:20) :: fact_table = [ &
1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
2432902008176640000.d0 ]
! sanity check ! sanity check
if(N < 0)then if(N < 0)then
write(*,*) "ffact: error input for factorial" write(*,*) "ffact: error input for factorial"
gont = 1.d0
return return
endif endif
! Use lookup table for small N (fast path) gont = 1.d0
if(N <= 20)then do i=1,N
gont = fact_table(N) gont = gont*i
else enddo
! Use log-gamma function for large N: N! = exp(log_gamma(N+1))
! This avoids overflow and is computed efficiently
gont = exp(log_gamma(dble(N+1)))
endif
return return

View File

@@ -16,66 +16,115 @@ using namespace std;
#include <string.h> #include <string.h>
#include <math.h> #include <math.h>
#endif #endif
/* Linear equation solution by Gauss-Jordan elimination.
// Intel oneMKL LAPACK interface
#include <mkl_lapacke.h>
/* Linear equation solution using Intel oneMKL LAPACK.
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
containing the right-hand side vectors. On output a is containing the right-hand side vectors. On output a is
replaced by its matrix inverse, and b is replaced by the replaced by its matrix inverse, and b is replaced by the
corresponding set of solution vectors. corresponding set of solution vectors */
Mathematical equivalence:
Solves: A * x = b => x = A^(-1) * b
Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
within numerical precision. */
int gaussj(double *a, double *b, int n) int gaussj(double *a, double *b, int n)
{ {
// Allocate pivot array and workspace double swap;
lapack_int *ipiv = new lapack_int[n];
lapack_int info;
// Make a copy of matrix a for solving (dgesv modifies it to LU form) int *indxc, *indxr, *ipiv;
double *a_copy = new double[n * n]; indxc = new int[n];
for (int i = 0; i < n * n; i++) { indxr = new int[n];
a_copy[i] = a[i]; ipiv = new int[n];
int i, icol, irow, j, k, l, ll;
double big, dum, pivinv, temp;
for (j = 0; j < n; j++)
ipiv[j] = 0;
for (i = 0; i < n; i++)
{
big = 0.0;
for (j = 0; j < n; j++)
if (ipiv[j] != 1)
for (k = 0; k < n; k++)
{
if (ipiv[k] == 0)
{
if (fabs(a[j * n + k]) >= big)
{
big = fabs(a[j * n + k]);
irow = j;
icol = k;
}
}
else if (ipiv[k] > 1)
{
cout << "gaussj: Singular Matrix-1" << endl;
for (int ii = 0; ii < n; ii++)
{
for (int jj = 0; jj < n; jj++)
cout << a[ii * n + jj] << " ";
cout << endl;
}
return 1; // error return
}
}
ipiv[icol] = ipiv[icol] + 1;
if (irow != icol)
{
for (l = 0; l < n; l++)
{
swap = a[irow * n + l];
a[irow * n + l] = a[icol * n + l];
a[icol * n + l] = swap;
}
swap = b[irow];
b[irow] = b[icol];
b[icol] = swap;
}
indxr[i] = irow;
indxc[i] = icol;
if (a[icol * n + icol] == 0.0)
{
cout << "gaussj: Singular Matrix-2" << endl;
for (int ii = 0; ii < n; ii++)
{
for (int jj = 0; jj < n; jj++)
cout << a[ii * n + jj] << " ";
cout << endl;
}
return 1; // error return
}
pivinv = 1.0 / a[icol * n + icol];
a[icol * n + icol] = 1.0;
for (l = 0; l < n; l++)
a[icol * n + l] *= pivinv;
b[icol] *= pivinv;
for (ll = 0; ll < n; ll++)
if (ll != icol)
{
dum = a[ll * n + icol];
a[ll * n + icol] = 0.0;
for (l = 0; l < n; l++)
a[ll * n + l] -= a[icol * n + l] * dum;
b[ll] -= b[icol] * dum;
}
} }
// Step 1: Solve linear system A*x = b using LU decomposition for (l = n - 1; l >= 0; l--)
// LAPACKE_dgesv uses column-major by default, but we use row-major {
info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1); if (indxr[l] != indxc[l])
for (k = 0; k < n; k++)
if (info != 0) { {
cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl; swap = a[k * n + indxr[l]];
delete[] ipiv; a[k * n + indxr[l]] = a[k * n + indxc[l]];
delete[] a_copy; a[k * n + indxc[l]] = swap;
return 1; }
}
// Step 2: Compute matrix inverse A^(-1) using LU factorization
// First do LU factorization of original matrix a
info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
}
// Then compute inverse from LU factorization
info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
} }
delete[] indxc;
delete[] indxr;
delete[] ipiv; delete[] ipiv;
delete[] a_copy;
return 0; return 0;
} }

View File

@@ -512,10 +512,11 @@
IMPLICIT DOUBLE PRECISION (A-H,O-Z) IMPLICIT DOUBLE PRECISION (A-H,O-Z)
DIMENSION V(N),W(N) DIMENSION V(N),W(N)
! SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT. ! SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT.
! Optimized using Intel oneMKL BLAS ddot
! Mathematical equivalence: DGVV = sum_{i=1}^{N} V(i)*W(i)
DOUBLE PRECISION, EXTERNAL :: DDOT SUM = 0.0D0
DGVV = DDOT(N, V, 1, W, 1) DO 10 I = 1,N
SUM = SUM + V(I)*W(I)
10 CONTINUE
DGVV = SUM
RETURN RETURN
END END

View File

@@ -487,201 +487,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
end subroutine lopsided end subroutine lopsided
!-----------------------------------------------------------------------------
! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
! Mathematically identical to calling lopsided then kodis separately.
!-----------------------------------------------------------------------------
subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
implicit none
!~~~~~~> Input parameters:
integer, intent(in) :: ex(1:3),Symmetry
real*8, intent(in) :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
real*8,dimension(ex(1),ex(2),ex(3)),intent(in) :: f,Sfx,Sfy,Sfz
real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
real*8,dimension(3),intent(in) ::SoA
real*8,intent(in) :: eps
!~~~~~~> local variables:
! note index -2,-1,0, so we have 3 extra points
real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)) :: fh
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: dX,dY,dZ
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
real*8, parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
real*8, parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
real*8, parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
! kodis parameters
real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
real*8, parameter :: cof=6.4d1 ! 2^6
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
d12dx = ONE/F12/dX
d12dy = ONE/F12/dY
d12dz = ONE/F12/dZ
d2dx = ONE/TWO/dX
d2dy = ONE/TWO/dY
d2dz = ONE/TWO/dZ
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
! Single symmetry_bd call shared by both advection and dissipation
call symmetry_bd(3,ex,f,fh,SoA)
! ---- Advection (lopsided) loop ----
! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
! x direction
if(Sfx(i,j,k) > ZEO)then
if(i+3 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
-F6*fh(i+2,j,k)+ fh(i+3,j,k))
elseif(i+2 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i+1 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
-F6*fh(i-2,j,k)+ fh(i-3,j,k))
endif
elseif(Sfx(i,j,k) < ZEO)then
if(i-3 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
-F6*fh(i-2,j,k)+ fh(i-3,j,k))
elseif(i-2 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i-1 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
-F6*fh(i+2,j,k)+ fh(i+3,j,k))
endif
endif
! y direction
if(Sfy(i,j,k) > ZEO)then
if(j+3 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
-F6*fh(i,j+2,k)+ fh(i,j+3,k))
elseif(j+2 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j+1 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
-F6*fh(i,j-2,k)+ fh(i,j-3,k))
endif
elseif(Sfy(i,j,k) < ZEO)then
if(j-3 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
-F6*fh(i,j-2,k)+ fh(i,j-3,k))
elseif(j-2 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j-1 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
-F6*fh(i,j+2,k)+ fh(i,j+3,k))
endif
endif
! z direction
if(Sfz(i,j,k) > ZEO)then
if(k+3 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
elseif(k+2 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k+1 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
-F6*fh(i,j,k-2)+ fh(i,j,k-3))
endif
elseif(Sfz(i,j,k) < ZEO)then
if(k-3 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
-F6*fh(i,j,k-2)+ fh(i,j,k-3))
elseif(k-2 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k-1 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
endif
endif
enddo
enddo
enddo
! ---- Dissipation (kodis) loop ----
if(eps > ZEO) then
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
if(i-3 >= imin .and. i+3 <= imax .and. &
j-3 >= jmin .and. j+3 <= jmax .and. &
k-3 >= kmin .and. k+3 <= kmax) then
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )/dX + &
( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )/dY + &
( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )/dZ )
endif
enddo
enddo
enddo
endif
return
end subroutine lopsided_kodis
#elif (ghost_width == 4) #elif (ghost_width == 4)
! sixth order code ! sixth order code
! Compute advection terms in right hand sides of field equations ! Compute advection terms in right hand sides of field equations

View File

@@ -2,7 +2,7 @@
#ifndef MICRODEF_H #ifndef MICRODEF_H
#define MICRODEF_H #define MICRODEF_H
#include "macrodef.fh" #include "microdef.fh"
// application parameters // application parameters

View File

@@ -16,12 +16,6 @@ include makefile.inc
.cu.o: .cu.o:
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
TwoPunctures.o: TwoPunctures.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
# Input files # Input files
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o bssn_class.o surface_integral.o ShellPatch.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\
@@ -102,7 +96,7 @@ ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES) TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean: clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f rm *.o ABE ABEGPU TwoPunctureABE make.log -f

View File

@@ -1,29 +1,19 @@
## GCC version (commented out)
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ ## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Intel oneAPI version with oneMKL (Optimized for performance) filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## LDLIBS = -L/usr/lib/x86_64-linux-gnu -lmpich -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
## Aggressive optimization flags: CXXAPPFLAGS = -O0 -Wno-deprecated -Dfortran3 -Dnewc
## -O3: Maximum optimization #f90appflags = -O0 -fpp
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible) f90appflags = -O0 -x f95-cpp-input
## -fp-model fast=2: Aggressive floating-point optimizations f90 = gfortran
## -fma: Enable fused multiply-add instructions f77 = gfortran
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ CXX = g++
-Dfortran3 -Dnewc -I${MKLROOT}/include CC = gcc
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ CLINKER = mpic++
-align array64byte -fpp -I${MKLROOT}/include
f90 = ifx
f77 = ifx
CXX = icpx
CC = icx
CLINKER = mpiicpx
Cu = nvcc Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include

View File

@@ -10,17 +10,6 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import subprocess import subprocess
import time
## CPU core binding configuration using taskset
## taskset ensures all child processes inherit the CPU affinity mask
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
NUMACTL_CPU_BIND = "taskset -c 0-111"
## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 104
################################################################## ##################################################################
@@ -37,11 +26,11 @@ def makefile_ABE():
print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " )
print( ) print( )
## Build command with CPU binding to nohz_full cores ## Build command
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE" makefile_command = "make -j96" + " ABE"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU" makefile_command = "make -j4" + " ABEGPU"
else: else:
print( " CPU/GPU numerical calculation setting is wrong " ) print( " CPU/GPU numerical calculation setting is wrong " )
print( ) print( )
@@ -78,8 +67,8 @@ def makefile_TwoPunctureABE():
print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " ) print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
print( ) print( )
## Build command with CPU binding to nohz_full cores ## Build command
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE" makefile_command = "make" + " TwoPunctureABE"
## Execute the command with subprocess.Popen and stream output ## Execute the command with subprocess.Popen and stream output
makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
@@ -116,10 +105,10 @@ def run_ABE():
## Define the command to run; cast other values to strings as needed ## Define the command to run; cast other values to strings as needed
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = "mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command = "mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output ## Execute the MPI command and stream output
@@ -152,13 +141,13 @@ def run_ABE():
## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE ## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
def run_TwoPunctureABE(): def run_TwoPunctureABE():
tp_time1=time.time()
print( ) print( )
print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) print( " Running the AMSS-NCKU executable file TwoPunctureABE " )
print( ) print( )
## Define the command to run ## Define the command to run
TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE" TwoPuncture_command = "./TwoPunctureABE"
TwoPuncture_command_outfile = "TwoPunctureABE_out.log" TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
## Execute the command with subprocess.Popen and stream output ## Execute the command with subprocess.Popen and stream output
@@ -179,9 +168,7 @@ def run_TwoPunctureABE():
print( ) print( )
print( " The TwoPunctureABE simulation is finished " ) print( " The TwoPunctureABE simulation is finished " )
print( ) print( )
tp_time2=time.time()
et=tp_time2-tp_time1
print(f"Used time: {et}")
return return
################################################################## ##################################################################

View File

@@ -1,29 +0,0 @@
import multiprocessing
def run_plot_task(task):
"""Execute a single plotting task.
Parameters
----------
task : tuple
A tuple of (function, args_tuple) where function is a callable
plotting function and args_tuple contains its arguments.
"""
func, args = task
return func(*args)
def run_plot_tasks_parallel(plot_tasks):
"""Execute a list of independent plotting tasks in parallel.
Uses the 'fork' context to create worker processes so that the main
script is NOT re-imported/re-executed in child processes.
Parameters
----------
plot_tasks : list of tuples
Each element is (function, args_tuple).
"""
ctx = multiprocessing.get_context('fork')
with ctx.Pool() as pool:
pool.map(run_plot_task, plot_tasks)

View File

@@ -11,8 +11,6 @@
import numpy ## numpy for array operations import numpy ## numpy for array operations
import scipy ## scipy for interpolation and signal processing import scipy ## scipy for interpolation and signal processing
import math import math
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt ## matplotlib for plotting import matplotlib.pyplot as plt ## matplotlib for plotting
import os ## os for system/file operations import os ## os for system/file operations

View File

@@ -8,23 +8,16 @@
## ##
################################################# #################################################
## Restrict OpenMP to one thread per process so that running
## many workers in parallel does not create an O(workers * BLAS_threads)
## thread explosion. The variable MUST be set before numpy/scipy
## are imported, because the BLAS library reads them only at load time.
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
import numpy import numpy
import scipy import scipy
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm from matplotlib.colors import LogNorm
from mpl_toolkits.mplot3d import Axes3D from mpl_toolkits.mplot3d import Axes3D
## import torch ## import torch
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import os
######################################################################################### #########################################################################################
@@ -199,19 +192,3 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
#################################################################################### ####################################################################################
####################################################################################
## Allow this module to be run as a standalone script so that each
## binary-data plot can be executed in a fresh subprocess whose BLAS
## environment variables (set above) take effect before numpy loads.
##
## Usage: python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
####################################################################################
if __name__ == '__main__':
import sys
if len(sys.argv) != 4:
print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
sys.exit(1)
plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3])

View File

@@ -8,8 +8,6 @@
################################################# #################################################
import numpy ## numpy for array operations import numpy ## numpy for array operations
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt ## matplotlib for plotting import matplotlib.pyplot as plt ## matplotlib for plotting
from mpl_toolkits.mplot3d import Axes3D ## needed for 3D plots from mpl_toolkits.mplot3d import Axes3D ## needed for 3D plots
import glob import glob
@@ -17,9 +15,6 @@ import os ## operating system utilities
import plot_binary_data import plot_binary_data
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import subprocess
import sys
import multiprocessing
# plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots # plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots
@@ -55,40 +50,10 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
file_list.append(x) file_list.append(x)
print(x) print(x)
## Plot each file in parallel using subprocesses. ## Plot each file in the list
## Each subprocess is a fresh Python process where the BLAS thread-count
## environment variables (set at the top of plot_binary_data.py) take
## effect before numpy is imported. This avoids the thread explosion
## that occurs when multiprocessing.Pool with 'fork' context inherits
## already-initialized multi-threaded BLAS from the parent.
script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
running = []
failed = []
for filename in file_list: for filename in file_list:
print(filename) print(filename)
proc = subprocess.Popen( plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
[sys.executable, script, filename, binary_outdir, figure_outdir],
)
running.append( (proc, filename) )
## Keep at most max_workers subprocesses active at a time
if len(running) >= max_workers:
p, fn = running.pop(0)
p.wait()
if p.returncode != 0:
failed.append(fn)
## Wait for all remaining subprocesses to finish
for p, fn in running:
p.wait()
if p.returncode != 0:
failed.append(fn)
if failed:
print( " WARNING: the following binary data plots failed:" )
for fn in failed:
print( " ", fn )
print( ) print( )
print( " Binary Data Plot Has been Finished " ) print( " Binary Data Plot Has been Finished " )