diff --git a/.gitignore b/.gitignore index 1a4ec9b..063cdec 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ __pycache__ GW150914 +GW150914-origin +docs +*.tmp + diff --git a/AMSS_NCKU_Program.py b/AMSS_NCKU_Program.py index 46d15f1..6a7952a 100755 --- a/AMSS_NCKU_Program.py +++ b/AMSS_NCKU_Program.py @@ -8,6 +8,14 @@ ## ################################################################## +## Guard against re-execution by multiprocessing child processes. +## Without this, using 'spawn' or 'forkserver' context would cause every +## worker to re-run the entire script, spawning exponentially more +## workers (fork bomb). +if __name__ != '__main__': + import sys as _sys + _sys.exit(0) + ################################################################## @@ -424,26 +432,31 @@ print( import plot_xiaoqu import plot_GW_strain_amplitude_xiaoqu +from parallel_plot_helper import run_plot_tasks_parallel + +plot_tasks = [] ## Plot black hole trajectory -plot_xiaoqu.generate_puncture_orbit_plot( binary_results_directory, figure_directory ) -plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory ) +plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot, (binary_results_directory, figure_directory) ) ) +plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) ) ## Plot black hole separation vs. 
time -plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory ) +plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) ) ## Plot gravitational waveforms (psi4 and strain amplitude) for i in range(input_data.Detector_Number): - plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i ) - plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i ) + plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) ) + plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) ) ## Plot ADM mass evolution for i in range(input_data.Detector_Number): - plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i ) + plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) ) ## Plot Hamiltonian constraint violation over time for i in range(input_data.grid_level): - plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i ) + plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) ) + +run_plot_tasks_parallel(plot_tasks) ## Plot stored binary data plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory ) diff --git a/AMSS_NCKU_Verify_ASC26.py b/AMSS_NCKU_Verify_ASC26.py new file mode 100644 index 0000000..ed386e7 --- /dev/null +++ b/AMSS_NCKU_Verify_ASC26.py @@ -0,0 +1,279 @@ +#!/usr/bin/env python3 +""" +AMSS-NCKU GW150914 Simulation Regression Test Script + +Verification Requirements: +1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2) +2. 
ADM constraint violation < 2 (Grid Level 0) + +RMS Calculation Method: +- Computes trajectory deviation on the XY plane independently for BH1 and BH2 +- For each black hole: RMS = sqrt((1/M) * sum((Δr_i / r_i^max)^2)) × 100% +- Final RMS = max(RMS_BH1, RMS_BH2) + +Usage: python3 AMSS_NCKU_Verify_ASC26.py [output_dir] +Default: output_dir = GW150914/AMSS_NCKU_output +Reference: GW150914-origin (baseline simulation) +""" + +import numpy as np +import sys +import os + +# ANSI Color Codes +class Color: + GREEN = '\033[92m' + RED = '\033[91m' + YELLOW = '\033[93m' + BLUE = '\033[94m' + BOLD = '\033[1m' + RESET = '\033[0m' + +def get_status_text(passed): + if passed: + return f"{Color.GREEN}{Color.BOLD}PASS{Color.RESET}" + else: + return f"{Color.RED}{Color.BOLD}FAIL{Color.RESET}" + +def load_bh_trajectory(filepath): + """Load black hole trajectory data""" + data = np.loadtxt(filepath) + return { + 'time': data[:, 0], + 'x1': data[:, 1], 'y1': data[:, 2], 'z1': data[:, 3], + 'x2': data[:, 4], 'y2': data[:, 5], 'z2': data[:, 6] + } + + +def load_constraint_data(filepath): + """Load constraint violation data""" + data = [] + with open(filepath, 'r') as f: + for line in f: + if line.startswith('#'): + continue + parts = line.split() + if len(parts) >= 8: + data.append([float(x) for x in parts[:8]]) + return np.array(data) + + +def calculate_rms_error(bh_data_ref, bh_data_target): + """ + Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations. + + This function computes the RMS error independently for BH1 and BH2 trajectories, + then returns the maximum of the two as the final RMS error metric. 
+ + For each black hole, the RMS is calculated as: + RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100% + + where: + Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2) + r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2)) + + Args: + bh_data_ref: Reference (baseline) trajectory data + bh_data_target: Target (optimized) trajectory data + + Returns: + rms_value: Final RMS error as a percentage (max of BH1 and BH2) + error: Error message if any + """ + # Align data: truncate to the length of the shorter dataset + M = min(len(bh_data_ref['time']), len(bh_data_target['time'])) + + if M < 10: + return None, "Insufficient data points for comparison" + + # Extract XY coordinates for both black holes + x1_ref = bh_data_ref['x1'][:M] + y1_ref = bh_data_ref['y1'][:M] + x2_ref = bh_data_ref['x2'][:M] + y2_ref = bh_data_ref['y2'][:M] + + x1_new = bh_data_target['x1'][:M] + y1_new = bh_data_target['y1'][:M] + x2_new = bh_data_target['x2'][:M] + y2_new = bh_data_target['y2'][:M] + + # Calculate RMS for BH1 + delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2) + r1_ref = np.sqrt(x1_ref**2 + y1_ref**2) + r1_new = np.sqrt(x1_new**2 + y1_new**2) + r1_max = np.maximum(r1_ref, r1_new) + + # Calculate RMS for BH2 + delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2) + r2_ref = np.sqrt(x2_ref**2 + y2_ref**2) + r2_new = np.sqrt(x2_new**2 + y2_new**2) + r2_max = np.maximum(r2_ref, r2_new) + + # Avoid division by zero for BH1 + valid_mask1 = r1_max > 1e-15 + if np.sum(valid_mask1) < 10: + return None, "Insufficient valid data points for BH1" + + terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2 + rms_bh1 = np.sqrt(np.mean(terms1)) * 100 + + # Avoid division by zero for BH2 + valid_mask2 = r2_max > 1e-15 + if np.sum(valid_mask2) < 10: + return None, "Insufficient valid data points for BH2" + + terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2 + rms_bh2 = np.sqrt(np.mean(terms2)) * 100 + + # Final RMS is the maximum of 
BH1 and BH2 + rms_final = max(rms_bh1, rms_bh2) + + return rms_final, None + + +def analyze_constraint_violation(constraint_data, n_levels=9): + """ + Analyze ADM constraint violation + Return maximum constraint violation for Grid Level 0 + """ + # Extract Grid Level 0 data (first entry for each time step) + level0_data = constraint_data[::n_levels] + + # Calculate maximum absolute value for each constraint + results = { + 'Ham': np.max(np.abs(level0_data[:, 1])), + 'Px': np.max(np.abs(level0_data[:, 2])), + 'Py': np.max(np.abs(level0_data[:, 3])), + 'Pz': np.max(np.abs(level0_data[:, 4])), + 'Gx': np.max(np.abs(level0_data[:, 5])), + 'Gy': np.max(np.abs(level0_data[:, 6])), + 'Gz': np.max(np.abs(level0_data[:, 7])) + } + + results['max_violation'] = max(results.values()) + return results + + +def print_header(): + """Print report header""" + print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET) + print(Color.BOLD + " AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET) + print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET) + + +def print_rms_results(rms_rel, error, threshold=1.0): + """Print RMS error results""" + print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}") + print("-" * 45) + + if error: + print(f" {Color.RED}Error: {error}{Color.RESET}") + return False + + passed = rms_rel < threshold + + print(f" RMS relative error: {rms_rel:.4f}%") + print(f" Requirement: < {threshold}%") + print(f" Status: {get_status_text(passed)}") + + return passed + + +def print_constraint_results(results, threshold=2.0): + """Print constraint violation results""" + print(f"\n{Color.BOLD}2. 
ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}") + print("-" * 45) + + names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz'] + for i, name in enumerate(names): + print(f" Max |{name:3}|: {results[name]:.6f}", end=" ") + if (i + 1) % 2 == 0: print() + if len(names) % 2 != 0: print() + + passed = results['max_violation'] < threshold + + print(f"\n Maximum violation: {results['max_violation']:.6f}") + print(f" Requirement: < {threshold}") + print(f" Status: {get_status_text(passed)}") + + return passed + + +def print_summary(rms_passed, constraint_passed): + """Print summary""" + print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET) + print(Color.BOLD + "Verification Summary" + Color.RESET) + print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET) + + all_passed = rms_passed and constraint_passed + + res_rms = get_status_text(rms_passed) + res_con = get_status_text(constraint_passed) + + print(f" [1] RMS trajectory check: {res_rms}") + print(f" [2] ADM constraint check: {res_con}") + + final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}" + print(f"\n Overall result: {final_status}") + print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET + "\n") + + return all_passed + + +def main(): + # Determine target (optimized) output directory + if len(sys.argv) > 1: + target_dir = sys.argv[1] + else: + script_dir = os.path.dirname(os.path.abspath(__file__)) + target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output") + + # Determine reference (baseline) directory + script_dir = os.path.dirname(os.path.abspath(__file__)) + reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output") + + # Data file paths + bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat") + bh_file_target = os.path.join(target_dir, "bssn_BH.dat") + constraint_file = os.path.join(target_dir, "bssn_constraint.dat") + + # Check if files exist + if not 
os.path.exists(bh_file_ref): + print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}") + sys.exit(1) + + if not os.path.exists(bh_file_target): + print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}") + sys.exit(1) + + if not os.path.exists(constraint_file): + print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}") + sys.exit(1) + + # Print header + print_header() + print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}") + print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}") + + # Load data + bh_data_ref = load_bh_trajectory(bh_file_ref) + bh_data_target = load_bh_trajectory(bh_file_target) + constraint_data = load_constraint_data(constraint_file) + + # Calculate RMS error + rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target) + rms_passed = print_rms_results(rms_rel, error) + + # Analyze constraint violation + constraint_results = analyze_constraint_violation(constraint_data) + constraint_passed = print_constraint_results(constraint_results) + + # Print summary + all_passed = print_summary(rms_passed, constraint_passed) + + # Return exit code + sys.exit(0 if all_passed else 1) + + +if __name__ == "__main__": + main() diff --git a/AMSS_NCKU_source/FFT.f90 b/AMSS_NCKU_source/FFT.f90 index 3c4a12c..7dfe727 100644 --- a/AMSS_NCKU_source/FFT.f90 +++ b/AMSS_NCKU_source/FFT.f90 @@ -37,57 +37,51 @@ close(77) end program checkFFT #endif +!------------- +! Optimized FFT using Intel oneMKL DFTI +! Mathematical equivalence: Standard DFT definition +! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N) +! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N) +! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...] 
!------------- SUBROUTINE four1(dataa,nn,isign) +use MKL_DFTI implicit none -INTEGER::isign,nn -double precision,dimension(2*nn)::dataa -INTEGER::i,istep,j,m,mmax,n -double precision::tempi,tempr -DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp -n=2*nn -j=1 -do i=1,n,2 - if(j.gt.i)then - tempr=dataa(j) - tempi=dataa(j+1) - dataa(j)=dataa(i) - dataa(j+1)=dataa(i+1) - dataa(i)=tempr - dataa(i+1)=tempi - endif - m=nn -1 if ((m.ge.2).and.(j.gt.m)) then - j=j-m - m=m/2 -goto 1 - endif -j=j+m -enddo -mmax=2 -2 if (n.gt.mmax) then - istep=2*mmax - theta=6.28318530717959d0/(isign*mmax) - wpr=-2.d0*sin(0.5d0*theta)**2 - wpi=sin(theta) - wr=1.d0 - wi=0.d0 - do m=1,mmax,2 - do i=m,n,istep - j=i+mmax - tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1) - tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j) - dataa(j)=dataa(i)-tempr - dataa(j+1)=dataa(i+1)-tempi - dataa(i)=dataa(i)+tempr - dataa(i+1)=dataa(i+1)+tempi - enddo - wtemp=wr - wr=wr*wpr-wi*wpi+wr - wi=wi*wpr+wtemp*wpi+wi - enddo -mmax=istep -goto 2 +INTEGER, intent(in) :: isign, nn +DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa + +type(DFTI_DESCRIPTOR), pointer :: desc +integer :: status + +! Create DFTI descriptor for 1D complex-to-complex transform +status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn) +if (status /= 0) return + +! Set input/output storage as interleaved complex (default) +status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE) +if (status /= 0) then + status = DftiFreeDescriptor(desc) + return endif + +! Commit the descriptor +status = DftiCommitDescriptor(desc) +if (status /= 0) then + status = DftiFreeDescriptor(desc) + return +endif + +! Execute FFT based on direction +if (isign == 1) then + ! Forward FFT: exp(-2*pi*i*k*n/N) + status = DftiComputeForward(desc, dataa) +else + ! Backward FFT: exp(+2*pi*i*k*n/N) + status = DftiComputeBackward(desc, dataa) +endif + +! 
Free descriptor +status = DftiFreeDescriptor(desc) + return END SUBROUTINE four1 diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C index f0deb56..91ead8a 100644 --- a/AMSS_NCKU_source/MPatch.C +++ b/AMSS_NCKU_source/MPatch.C @@ -341,8 +341,9 @@ void Patch::Interp_Points(MyList *VarList, double *Shellf, int Symmetry) { // NOTE: we do not Synchnize variables here, make sure of that before calling this routine - int myrank; + int myrank, nprocs; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); int ordn = 2 * ghost_width; MyList *varl; @@ -354,24 +355,18 @@ void Patch::Interp_Points(MyList *VarList, varl = varl->next; } - double *shellf; - shellf = new double[NN * num_var]; - memset(shellf, 0, sizeof(double) * NN * num_var); + memset(Shellf, 0, sizeof(double) * NN * num_var); - // we use weight to monitor code, later some day we can move it for optimization - int *weight; - weight = new int[NN]; - memset(weight, 0, sizeof(int) * NN); - - double *DH, *llb, *uub; - DH = new double[dim]; + // owner_rank[j] records which MPI rank owns point j + // All ranks traverse the same block list so they all agree on ownership + int *owner_rank; + owner_rank = new int[NN]; + for (int j = 0; j < NN; j++) + owner_rank[j] = -1; + double DH[dim], llb[dim], uub[dim]; for (int i = 0; i < dim; i++) - { DH[i] = getdX(i); - } - llb = new double[dim]; - uub = new double[dim]; for (int j = 0; j < NN; j++) // run along points { @@ -403,12 +398,6 @@ void Patch::Interp_Points(MyList *VarList, bool flag = true; for (int i = 0; i < dim; i++) { -// NOTE: our dividing structure is (exclude ghost) -// -1 0 -// 1 2 -// so (0,1) does not belong to any part for vertex structure -// here we put (0,0.5) to left part and (0.5,1) to right part -// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined @@ -433,6 +422,7 @@ void Patch::Interp_Points(MyList 
*VarList, if (flag) { notfind = false; + owner_rank[j] = BP->rank; if (myrank == BP->rank) { //---> interpolation @@ -440,14 +430,11 @@ void Patch::Interp_Points(MyList *VarList, int k = 0; while (varl) // run along variables { - // shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn], - // pox,ordn,varl->data->SoA,Symmetry); - f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k], + f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); varl = varl->next; k++; } - weight[j] = 1; } } if (Bp == ble) @@ -456,103 +443,327 @@ void Patch::Interp_Points(MyList *VarList, } } - MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - int *Weight; - Weight = new int[NN]; - MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - - // misc::tillherecheck("print me"); - - for (int i = 0; i < NN; i++) + // Replace MPI_Allreduce with per-owner MPI_Bcast: + // Group consecutive points by owner rank and broadcast each group. + // Since each point's data is non-zero only on the owner rank, + // Bcast from owner is equivalent to Allreduce(MPI_SUM) but much cheaper. 
{ - if (Weight[i] > 1) + int j = 0; + while (j < NN) { - if (myrank == 0) - cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl; - for (int j = 0; j < num_var; j++) - Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i]; + int cur_owner = owner_rank[j]; + if (cur_owner < 0) + { + if (myrank == 0) + { + cout << "ERROR: Patch::Interp_Points fails to find point ("; + for (int d = 0; d < dim; d++) + { + cout << XX[d][j]; + if (d < dim - 1) + cout << ","; + else + cout << ")"; + } + cout << " on Patch ("; + for (int d = 0; d < dim; d++) + { + cout << bbox[d] << "+" << lli[d] * DH[d]; + if (d < dim - 1) + cout << ","; + else + cout << ")--"; + } + cout << "("; + for (int d = 0; d < dim; d++) + { + cout << bbox[dim + d] << "-" << uui[d] * DH[d]; + if (d < dim - 1) + cout << ","; + else + cout << ")" << endl; + } + MPI_Abort(MPI_COMM_WORLD, 1); + } + j++; + continue; + } + // Find contiguous run of points with the same owner + int jstart = j; + while (j < NN && owner_rank[j] == cur_owner) + j++; + int count = (j - jstart) * num_var; + MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD); } - else if (Weight[i] == 0 && myrank == 0) + } + + delete[] owner_rank; +} +void Patch::Interp_Points(MyList *VarList, + int NN, double **XX, + double *Shellf, int Symmetry, + int Nmin_consumer, int Nmax_consumer) +{ + // Targeted point-to-point overload: each owner sends each point only to + // the one rank that needs it for integration (consumer), reducing + // communication volume by ~nprocs times compared to the Bcast version. 
+ int myrank, nprocs; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + int ordn = 2 * ghost_width; + MyList *varl; + int num_var = 0; + varl = VarList; + while (varl) + { + num_var++; + varl = varl->next; + } + + memset(Shellf, 0, sizeof(double) * NN * num_var); + + // owner_rank[j] records which MPI rank owns point j + int *owner_rank; + owner_rank = new int[NN]; + for (int j = 0; j < NN; j++) + owner_rank[j] = -1; + + double DH[dim], llb[dim], uub[dim]; + for (int i = 0; i < dim; i++) + DH[i] = getdX(i); + + // --- Interpolation phase (identical to original) --- + for (int j = 0; j < NN; j++) + { + double pox[dim]; + for (int i = 0; i < dim; i++) + { + pox[i] = XX[i][j]; + if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i])) + { + cout << "Patch::Interp_Points: point ("; + for (int k = 0; k < dim; k++) + { + cout << XX[k][j]; + if (k < dim - 1) + cout << ","; + else + cout << ") is out of current Patch." << endl; + } + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + + MyList *Bp = blb; + bool notfind = true; + while (notfind && Bp) + { + Block *BP = Bp->data; + + bool flag = true; + for (int i = 0; i < dim; i++) + { +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i]; + uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i]; +#else +#ifdef Cell + llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i]; + uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? 
BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i]; +#else +#error Not define Vertex nor Cell +#endif +#endif + if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2) + { + flag = false; + break; + } + } + + if (flag) + { + notfind = false; + owner_rank[j] = BP->rank; + if (myrank == BP->rank) + { + varl = VarList; + int k = 0; + while (varl) + { + f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], + pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); + varl = varl->next; + k++; + } + } + } + if (Bp == ble) + break; + Bp = Bp->next; + } + } + + // --- Error check for unfound points --- + for (int j = 0; j < NN; j++) + { + if (owner_rank[j] < 0 && myrank == 0) { cout << "ERROR: Patch::Interp_Points fails to find point ("; - for (int j = 0; j < dim; j++) + for (int d = 0; d < dim; d++) { - cout << XX[j][i]; - if (j < dim - 1) + cout << XX[d][j]; + if (d < dim - 1) cout << ","; else cout << ")"; } cout << " on Patch ("; - for (int j = 0; j < dim; j++) + for (int d = 0; d < dim; d++) { - cout << bbox[j] << "+" << lli[j] * getdX(j); - if (j < dim - 1) + cout << bbox[d] << "+" << lli[d] * DH[d]; + if (d < dim - 1) cout << ","; else cout << ")--"; } cout << "("; - for (int j = 0; j < dim; j++) + for (int d = 0; d < dim; d++) { - cout << bbox[dim + j] << "-" << uui[j] * getdX(j); - if (j < dim - 1) + cout << bbox[dim + d] << "-" << uui[d] * DH[d]; + if (d < dim - 1) cout << ","; else cout << ")" << endl; } -#if 0 - checkBlock(); -#else - cout << "splited domains:" << endl; - { - MyList *Bp = blb; - while (Bp) - { - Block *BP = Bp->data; - - for (int i = 0; i < dim; i++) - { -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i]; - uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? 
BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i]; -#else -#ifdef Cell - llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i]; - uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i]; -#else -#error Not define Vertex nor Cell -#endif -#endif - } - cout << "("; - for (int j = 0; j < dim; j++) - { - cout << llb[j] << ":" << uub[j]; - if (j < dim - 1) - cout << ","; - else - cout << ")" << endl; - } - if (Bp == ble) - break; - Bp = Bp->next; - } - } -#endif MPI_Abort(MPI_COMM_WORLD, 1); } } - delete[] shellf; - delete[] weight; - delete[] Weight; - delete[] DH; - delete[] llb; - delete[] uub; + // --- Targeted point-to-point communication phase --- + // Compute consumer_rank[j] using the same deterministic formula as surface_integral + int *consumer_rank = new int[NN]; + { + int mp = NN / nprocs; + int Lp = NN - nprocs * mp; + for (int j = 0; j < NN; j++) + { + if (j < Lp * (mp + 1)) + consumer_rank[j] = j / (mp + 1); + else + consumer_rank[j] = Lp + (j - Lp * (mp + 1)) / mp; + } + } + + // Count sends and recvs per rank + int *send_count = new int[nprocs]; + int *recv_count = new int[nprocs]; + memset(send_count, 0, sizeof(int) * nprocs); + memset(recv_count, 0, sizeof(int) * nprocs); + + for (int j = 0; j < NN; j++) + { + int own = owner_rank[j]; + int con = consumer_rank[j]; + if (own == con) + continue; // local — no communication needed + if (own == myrank) + send_count[con]++; + if (con == myrank) + recv_count[own]++; + } + + // Build send buffers: for each destination rank, pack (index, data) pairs + // Each entry: 1 int (point index j) + num_var doubles + int total_send = 0, total_recv = 0; + int *send_offset = new int[nprocs]; + int *recv_offset = new int[nprocs]; + for (int r = 0; r < nprocs; r++) + { + send_offset[r] = total_send; + total_send += send_count[r]; + recv_offset[r] 
= total_recv; + total_recv += recv_count[r]; + } + + // Pack send buffers: each message contains (j, data[0..num_var-1]) per point + int stride = 1 + num_var; // 1 double for index + num_var doubles for data + double *sendbuf = new double[total_send * stride]; + double *recvbuf = new double[total_recv * stride]; + + // Temporary counters for packing + int *pack_pos = new int[nprocs]; + memset(pack_pos, 0, sizeof(int) * nprocs); + + for (int j = 0; j < NN; j++) + { + int own = owner_rank[j]; + int con = consumer_rank[j]; + if (own != myrank || con == myrank) + continue; + int pos = (send_offset[con] + pack_pos[con]) * stride; + sendbuf[pos] = (double)j; // point index + for (int v = 0; v < num_var; v++) + sendbuf[pos + 1 + v] = Shellf[j * num_var + v]; + pack_pos[con]++; + } + + // Post non-blocking recvs and sends + int n_req = 0; + for (int r = 0; r < nprocs; r++) + { + if (recv_count[r] > 0) n_req++; + if (send_count[r] > 0) n_req++; + } + + MPI_Request *reqs = new MPI_Request[n_req]; + int req_idx = 0; + + for (int r = 0; r < nprocs; r++) + { + if (recv_count[r] > 0) + { + MPI_Irecv(recvbuf + recv_offset[r] * stride, + recv_count[r] * stride, MPI_DOUBLE, + r, 0, MPI_COMM_WORLD, &reqs[req_idx++]); + } + } + for (int r = 0; r < nprocs; r++) + { + if (send_count[r] > 0) + { + MPI_Isend(sendbuf + send_offset[r] * stride, + send_count[r] * stride, MPI_DOUBLE, + r, 0, MPI_COMM_WORLD, &reqs[req_idx++]); + } + } + + if (n_req > 0) + MPI_Waitall(n_req, reqs, MPI_STATUSES_IGNORE); + + // Unpack recv buffers into Shellf + for (int i = 0; i < total_recv; i++) + { + int pos = i * stride; + int j = (int)recvbuf[pos]; + for (int v = 0; v < num_var; v++) + Shellf[j * num_var + v] = recvbuf[pos + 1 + v]; + } + + delete[] reqs; + delete[] sendbuf; + delete[] recvbuf; + delete[] pack_pos; + delete[] send_offset; + delete[] recv_offset; + delete[] send_count; + delete[] recv_count; + delete[] consumer_rank; + delete[] owner_rank; } void Patch::Interp_Points(MyList *VarList, int NN, 
double **XX, @@ -573,24 +784,22 @@ void Patch::Interp_Points(MyList *VarList, varl = varl->next; } - double *shellf; - shellf = new double[NN * num_var]; - memset(shellf, 0, sizeof(double) * NN * num_var); + memset(Shellf, 0, sizeof(double) * NN * num_var); - // we use weight to monitor code, later some day we can move it for optimization - int *weight; - weight = new int[NN]; - memset(weight, 0, sizeof(int) * NN); + // owner_rank[j] stores the global rank that owns point j + int *owner_rank; + owner_rank = new int[NN]; + for (int j = 0; j < NN; j++) + owner_rank[j] = -1; - double *DH, *llb, *uub; - DH = new double[dim]; + // Build global-to-local rank translation for Comm_here + MPI_Group world_group, local_group; + MPI_Comm_group(MPI_COMM_WORLD, &world_group); + MPI_Comm_group(Comm_here, &local_group); + double DH[dim], llb[dim], uub[dim]; for (int i = 0; i < dim; i++) - { DH[i] = getdX(i); - } - llb = new double[dim]; - uub = new double[dim]; for (int j = 0; j < NN; j++) // run along points { @@ -622,12 +831,6 @@ void Patch::Interp_Points(MyList *VarList, bool flag = true; for (int i = 0; i < dim; i++) { -// NOTE: our dividing structure is (exclude ghost) -// -1 0 -// 1 2 -// so (0,1) does not belong to any part for vertex structure -// here we put (0,0.5) to left part and (0.5,1) to right part -// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined @@ -652,6 +855,7 @@ void Patch::Interp_Points(MyList *VarList, if (flag) { notfind = false; + owner_rank[j] = BP->rank; if (myrank == BP->rank) { //---> interpolation @@ -659,14 +863,11 @@ void Patch::Interp_Points(MyList *VarList, int k = 0; while (varl) // run along variables { - // shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn], - // pox,ordn,varl->data->SoA,Symmetry); - f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], 
shellf[j * num_var + k], + f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); varl = varl->next; k++; } - weight[j] = 1; } } if (Bp == ble) @@ -675,97 +876,35 @@ void Patch::Interp_Points(MyList *VarList, } } - MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, Comm_here); - int *Weight; - Weight = new int[NN]; - MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, Comm_here); - - // misc::tillherecheck("print me"); - // if(lmyrank == 0) cout<<"myrank = "< 1) + int j = 0; + while (j < NN) { - if (lmyrank == 0) - cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl; - for (int j = 0; j < num_var; j++) - Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i]; + int cur_owner_global = owner_rank[j]; + if (cur_owner_global < 0) + { + // Point not found — skip (error check disabled for sub-communicator levels) + j++; + continue; + } + // Translate global rank to local rank in Comm_here + int cur_owner_local; + MPI_Group_translate_ranks(world_group, 1, &cur_owner_global, local_group, &cur_owner_local); + + // Find contiguous run of points with the same owner + int jstart = j; + while (j < NN && owner_rank[j] == cur_owner_global) + j++; + int count = (j - jstart) * num_var; + MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner_local, Comm_here); } -#if 0 // for not involved levels, this may fail - else if(Weight[i] == 0 && lmyrank == 0) - { - cout<<"ERROR: Patch::Interp_Points fails to find point ("; - for(int j=0;j *Bp=blb; - while(Bp) - { - Block *BP=Bp->data; - - for(int i=0;ibbox[i] ,bbox[i] ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i] : BP->bbox[i] +(ghost_width-0.5)*DH[i]; - uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-(ghost_width-0.5)*DH[i]; -#else -#ifdef Cell - llb[i] = (feq(BP->bbox[i] ,bbox[i] ,DH[i]/2)) ? 
BP->bbox[i]+lli[i]*DH[i] : BP->bbox[i] +ghost_width*DH[i]; - uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-ghost_width*DH[i]; -#else -#error Not define Vertex nor Cell -#endif -#endif - } - cout<<"("; - for(int j=0;jnext; - } - } -#endif - MPI_Abort(MPI_COMM_WORLD,1); - } -#endif } - delete[] shellf; - delete[] weight; - delete[] Weight; - delete[] DH; - delete[] llb; - delete[] uub; + MPI_Group_free(&world_group); + MPI_Group_free(&local_group); + delete[] owner_rank; } void Patch::checkBlock() { diff --git a/AMSS_NCKU_source/MPatch.h b/AMSS_NCKU_source/MPatch.h index ca19ca5..b993be6 100644 --- a/AMSS_NCKU_source/MPatch.h +++ b/AMSS_NCKU_source/MPatch.h @@ -39,6 +39,10 @@ public: bool Find_Point(double *XX); + void Interp_Points(MyList *VarList, + int NN, double **XX, + double *Shellf, int Symmetry, + int Nmin_consumer, int Nmax_consumer); void Interp_Points(MyList *VarList, int NN, double **XX, double *Shellf, int Symmetry, MPI_Comm Comm_here); diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 713a6a7..a9fb3cd 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -3756,6 +3756,502 @@ void Parallel::Sync(MyList *PatL, MyList *VarList, int Symmetry) delete[] transfer_src; delete[] transfer_dst; } +// Merged Sync: collect all intra-patch and inter-patch grid segment lists, +// then issue a single transfer() call instead of N+1 separate ones. 
+void Parallel::Sync_merged(MyList *PatL, MyList *VarList, int Symmetry) +{ + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + + MyList **combined_src = new MyList *[cpusize]; + MyList **combined_dst = new MyList *[cpusize]; + for (int node = 0; node < cpusize; node++) + combined_src[node] = combined_dst[node] = 0; + + // Phase A: Intra-patch ghost exchange segments + MyList *Pp = PatL; + while (Pp) + { + Patch *Pat = Pp->data; + MyList *dst_ghost = build_ghost_gsl(Pat); + + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl0(Pat, node); + MyList *tsrc = 0, *tdst = 0; + build_gstl(src_owned, dst_ghost, &tsrc, &tdst); + + if (tsrc) + { + if (combined_src[node]) + combined_src[node]->catList(tsrc); + else + combined_src[node] = tsrc; + } + if (tdst) + { + if (combined_dst[node]) + combined_dst[node]->catList(tdst); + else + combined_dst[node] = tdst; + } + + if (src_owned) + src_owned->destroyList(); + } + + if (dst_ghost) + dst_ghost->destroyList(); + + Pp = Pp->next; + } + + // Phase B: Inter-patch buffer exchange segments + MyList *dst_buffer = build_buffer_gsl(PatL); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl(PatL, node, 5, Symmetry); + MyList *tsrc = 0, *tdst = 0; + build_gstl(src_owned, dst_buffer, &tsrc, &tdst); + + if (tsrc) + { + if (combined_src[node]) + combined_src[node]->catList(tsrc); + else + combined_src[node] = tsrc; + } + if (tdst) + { + if (combined_dst[node]) + combined_dst[node]->catList(tdst); + else + combined_dst[node] = tdst; + } + + if (src_owned) + src_owned->destroyList(); + } + if (dst_buffer) + dst_buffer->destroyList(); + + // Phase C: Single transfer + transfer(combined_src, combined_dst, VarList, VarList, Symmetry); + + // Phase D: Cleanup + for (int node = 0; node < cpusize; node++) + { + if (combined_src[node]) + combined_src[node]->destroyList(); + if (combined_dst[node]) + combined_dst[node]->destroyList(); + } + delete[] combined_src; + 
delete[] combined_dst; +} +// SyncCache constructor +Parallel::SyncCache::SyncCache() + : valid(false), cpusize(0), combined_src(0), combined_dst(0), + send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0), + send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0), + lengths_valid(false) +{ +} +// SyncCache invalidate: free grid segment lists but keep buffers +void Parallel::SyncCache::invalidate() +{ + if (!valid) + return; + for (int i = 0; i < cpusize; i++) + { + if (combined_src[i]) + combined_src[i]->destroyList(); + if (combined_dst[i]) + combined_dst[i]->destroyList(); + combined_src[i] = combined_dst[i] = 0; + send_lengths[i] = recv_lengths[i] = 0; + } + valid = false; + lengths_valid = false; +} +// SyncCache destroy: free everything +void Parallel::SyncCache::destroy() +{ + invalidate(); + if (combined_src) delete[] combined_src; + if (combined_dst) delete[] combined_dst; + if (send_lengths) delete[] send_lengths; + if (recv_lengths) delete[] recv_lengths; + if (send_buf_caps) delete[] send_buf_caps; + if (recv_buf_caps) delete[] recv_buf_caps; + for (int i = 0; i < cpusize; i++) + { + if (send_bufs && send_bufs[i]) delete[] send_bufs[i]; + if (recv_bufs && recv_bufs[i]) delete[] recv_bufs[i]; + } + if (send_bufs) delete[] send_bufs; + if (recv_bufs) delete[] recv_bufs; + if (reqs) delete[] reqs; + if (stats) delete[] stats; + combined_src = combined_dst = 0; + send_lengths = recv_lengths = 0; + send_buf_caps = recv_buf_caps = 0; + send_bufs = recv_bufs = 0; + reqs = 0; stats = 0; + cpusize = 0; max_reqs = 0; +} +// transfer_cached: reuse pre-allocated buffers from SyncCache +void Parallel::transfer_cached(MyList **src, MyList **dst, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) +{ + int myrank; + MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + int cpusize = cache.cpusize; + + int req_no = 0; + int node; + + for (node = 0; node < cpusize; node++) + { + if (node == 
myrank) + { + int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[node] = length; + if (length > 0) + { + if (length > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[length]; + cache.recv_buf_caps[node] = length; + } + data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + } + } + else + { + // send + int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + cache.send_lengths[node] = slength; + if (slength > 0) + { + if (slength > cache.send_buf_caps[node]) + { + if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; + cache.send_bufs[node] = new double[slength]; + cache.send_buf_caps[node] = slength; + } + data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++); + } + // recv + int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[node] = rlength; + if (rlength > 0) + { + if (rlength > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[rlength]; + cache.recv_buf_caps[node] = rlength; + } + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++); + } + } + } + + MPI_Waitall(req_no, cache.reqs, cache.stats); + + for (node = 0; node < cpusize; node++) + if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0) + data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); +} +// Sync_cached: build grid segment lists on first call, reuse on subsequent calls +void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, 
SyncCache &cache) +{ + if (!cache.valid) + { + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + cache.cpusize = cpusize; + + // Allocate cache arrays if needed + if (!cache.combined_src) + { + cache.combined_src = new MyList *[cpusize]; + cache.combined_dst = new MyList *[cpusize]; + cache.send_lengths = new int[cpusize]; + cache.recv_lengths = new int[cpusize]; + cache.send_bufs = new double *[cpusize]; + cache.recv_bufs = new double *[cpusize]; + cache.send_buf_caps = new int[cpusize]; + cache.recv_buf_caps = new int[cpusize]; + for (int i = 0; i < cpusize; i++) + { + cache.send_bufs[i] = cache.recv_bufs[i] = 0; + cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; + } + cache.max_reqs = 2 * cpusize; + cache.reqs = new MPI_Request[cache.max_reqs]; + cache.stats = new MPI_Status[cache.max_reqs]; + } + + for (int node = 0; node < cpusize; node++) + { + cache.combined_src[node] = cache.combined_dst[node] = 0; + cache.send_lengths[node] = cache.recv_lengths[node] = 0; + } + + // Build intra-patch segments (same as Sync_merged Phase A) + MyList *Pp = PatL; + while (Pp) + { + Patch *Pat = Pp->data; + MyList *dst_ghost = build_ghost_gsl(Pat); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl0(Pat, node); + MyList *tsrc = 0, *tdst = 0; + build_gstl(src_owned, dst_ghost, &tsrc, &tdst); + if (tsrc) + { + if (cache.combined_src[node]) + cache.combined_src[node]->catList(tsrc); + else + cache.combined_src[node] = tsrc; + } + if (tdst) + { + if (cache.combined_dst[node]) + cache.combined_dst[node]->catList(tdst); + else + cache.combined_dst[node] = tdst; + } + if (src_owned) src_owned->destroyList(); + } + if (dst_ghost) dst_ghost->destroyList(); + Pp = Pp->next; + } + + // Build inter-patch segments (same as Sync_merged Phase B) + MyList *dst_buffer = build_buffer_gsl(PatL); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl(PatL, node, 5, Symmetry); + MyList *tsrc = 0, *tdst = 0; + 
build_gstl(src_owned, dst_buffer, &tsrc, &tdst); + if (tsrc) + { + if (cache.combined_src[node]) + cache.combined_src[node]->catList(tsrc); + else + cache.combined_src[node] = tsrc; + } + if (tdst) + { + if (cache.combined_dst[node]) + cache.combined_dst[node]->catList(tdst); + else + cache.combined_dst[node] = tdst; + } + if (src_owned) src_owned->destroyList(); + } + if (dst_buffer) dst_buffer->destroyList(); + + cache.valid = true; + } + + // Use cached lists with buffer-reusing transfer + transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache); +} +// Sync_start: pack and post MPI_Isend/Irecv, return immediately +void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetry, + SyncCache &cache, AsyncSyncState &state) +{ + // Ensure cache is built + if (!cache.valid) + { + // Build cache (same logic as Sync_cached) + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + cache.cpusize = cpusize; + + if (!cache.combined_src) + { + cache.combined_src = new MyList *[cpusize]; + cache.combined_dst = new MyList *[cpusize]; + cache.send_lengths = new int[cpusize]; + cache.recv_lengths = new int[cpusize]; + cache.send_bufs = new double *[cpusize]; + cache.recv_bufs = new double *[cpusize]; + cache.send_buf_caps = new int[cpusize]; + cache.recv_buf_caps = new int[cpusize]; + for (int i = 0; i < cpusize; i++) + { + cache.send_bufs[i] = cache.recv_bufs[i] = 0; + cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; + } + cache.max_reqs = 2 * cpusize; + cache.reqs = new MPI_Request[cache.max_reqs]; + cache.stats = new MPI_Status[cache.max_reqs]; + } + + for (int node = 0; node < cpusize; node++) + { + cache.combined_src[node] = cache.combined_dst[node] = 0; + cache.send_lengths[node] = cache.recv_lengths[node] = 0; + } + + MyList *Pp = PatL; + while (Pp) + { + Patch *Pat = Pp->data; + MyList *dst_ghost = build_ghost_gsl(Pat); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl0(Pat, node); + 
MyList *tsrc = 0, *tdst = 0; + build_gstl(src_owned, dst_ghost, &tsrc, &tdst); + if (tsrc) + { + if (cache.combined_src[node]) + cache.combined_src[node]->catList(tsrc); + else + cache.combined_src[node] = tsrc; + } + if (tdst) + { + if (cache.combined_dst[node]) + cache.combined_dst[node]->catList(tdst); + else + cache.combined_dst[node] = tdst; + } + if (src_owned) src_owned->destroyList(); + } + if (dst_ghost) dst_ghost->destroyList(); + Pp = Pp->next; + } + + MyList *dst_buffer = build_buffer_gsl(PatL); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl(PatL, node, 5, Symmetry); + MyList *tsrc = 0, *tdst = 0; + build_gstl(src_owned, dst_buffer, &tsrc, &tdst); + if (tsrc) + { + if (cache.combined_src[node]) + cache.combined_src[node]->catList(tsrc); + else + cache.combined_src[node] = tsrc; + } + if (tdst) + { + if (cache.combined_dst[node]) + cache.combined_dst[node]->catList(tdst); + else + cache.combined_dst[node] = tdst; + } + if (src_owned) src_owned->destroyList(); + } + if (dst_buffer) dst_buffer->destroyList(); + cache.valid = true; + } + + // Now pack and post async MPI operations + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + int cpusize = cache.cpusize; + state.req_no = 0; + state.active = true; + + MyList **src = cache.combined_src; + MyList **dst = cache.combined_dst; + + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) + { + int length; + if (!cache.lengths_valid) { + length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + cache.recv_lengths[node] = length; + } else { + length = cache.recv_lengths[node]; + } + if (length > 0) + { + if (length > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[length]; + cache.recv_buf_caps[node] = length; + } + data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + } + } + else + { + 
int slength; + if (!cache.lengths_valid) { + slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + cache.send_lengths[node] = slength; + } else { + slength = cache.send_lengths[node]; + } + if (slength > 0) + { + if (slength > cache.send_buf_caps[node]) + { + if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; + cache.send_bufs[node] = new double[slength]; + cache.send_buf_caps[node] = slength; + } + data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); + } + int rlength; + if (!cache.lengths_valid) { + rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry); + cache.recv_lengths[node] = rlength; + } else { + rlength = cache.recv_lengths[node]; + } + if (rlength > 0) + { + if (rlength > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[rlength]; + cache.recv_buf_caps[node] = rlength; + } + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); + } + } + } + cache.lengths_valid = true; +} +// Sync_finish: wait for async MPI operations and unpack +void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, + MyList *VarList, int Symmetry) +{ + if (!state.active) + return; + + MPI_Waitall(state.req_no, cache.reqs, cache.stats); + + int cpusize = cache.cpusize; + MyList **src = cache.combined_src; + MyList **dst = cache.combined_dst; + + for (int node = 0; node < cpusize; node++) + if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0) + data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry); + + state.active = false; +} // collect buffer grid segments or blocks for the periodic boundary condition of given patch // 
--------------------------------------------------- // |con | |con | @@ -4790,6 +5286,203 @@ void Parallel::OutBdLow2Himix(MyList *PatcL, MyList *PatfL, delete[] transfer_src; delete[] transfer_dst; } + +// Restrict_cached: cache grid segment lists, reuse buffers via transfer_cached +void Parallel::Restrict_cached(MyList *PatcL, MyList *PatfL, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) +{ + if (!cache.valid) + { + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + cache.cpusize = cpusize; + + if (!cache.combined_src) + { + cache.combined_src = new MyList *[cpusize]; + cache.combined_dst = new MyList *[cpusize]; + cache.send_lengths = new int[cpusize]; + cache.recv_lengths = new int[cpusize]; + cache.send_bufs = new double *[cpusize]; + cache.recv_bufs = new double *[cpusize]; + cache.send_buf_caps = new int[cpusize]; + cache.recv_buf_caps = new int[cpusize]; + for (int i = 0; i < cpusize; i++) + { + cache.send_bufs[i] = cache.recv_bufs[i] = 0; + cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; + } + cache.max_reqs = 2 * cpusize; + cache.reqs = new MPI_Request[cache.max_reqs]; + cache.stats = new MPI_Status[cache.max_reqs]; + } + + MyList *dst = build_complete_gsl(PatcL); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl(PatfL, node, 2, Symmetry); + build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]); + if (src_owned) src_owned->destroyList(); + } + if (dst) dst->destroyList(); + + cache.valid = true; + } + + transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache); +} + +// OutBdLow2Hi_cached: cache grid segment lists, reuse buffers via transfer_cached +void Parallel::OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) +{ + if (!cache.valid) + { + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + cache.cpusize = cpusize; + + if (!cache.combined_src) 
+ { + cache.combined_src = new MyList *[cpusize]; + cache.combined_dst = new MyList *[cpusize]; + cache.send_lengths = new int[cpusize]; + cache.recv_lengths = new int[cpusize]; + cache.send_bufs = new double *[cpusize]; + cache.recv_bufs = new double *[cpusize]; + cache.send_buf_caps = new int[cpusize]; + cache.recv_buf_caps = new int[cpusize]; + for (int i = 0; i < cpusize; i++) + { + cache.send_bufs[i] = cache.recv_bufs[i] = 0; + cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; + } + cache.max_reqs = 2 * cpusize; + cache.reqs = new MPI_Request[cache.max_reqs]; + cache.stats = new MPI_Status[cache.max_reqs]; + } + + MyList *dst = build_buffer_gsl(PatfL); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry); + build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]); + if (src_owned) src_owned->destroyList(); + } + if (dst) dst->destroyList(); + + cache.valid = true; + } + + transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache); +} + +// OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking +void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) +{ + if (!cache.valid) + { + int cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + cache.cpusize = cpusize; + + if (!cache.combined_src) + { + cache.combined_src = new MyList *[cpusize]; + cache.combined_dst = new MyList *[cpusize]; + cache.send_lengths = new int[cpusize]; + cache.recv_lengths = new int[cpusize]; + cache.send_bufs = new double *[cpusize]; + cache.recv_bufs = new double *[cpusize]; + cache.send_buf_caps = new int[cpusize]; + cache.recv_buf_caps = new int[cpusize]; + for (int i = 0; i < cpusize; i++) + { + cache.send_bufs[i] = cache.recv_bufs[i] = 0; + cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0; + } + cache.max_reqs = 2 * cpusize; + cache.reqs = new 
MPI_Request[cache.max_reqs]; + cache.stats = new MPI_Status[cache.max_reqs]; + } + + MyList *dst = build_buffer_gsl(PatfL); + for (int node = 0; node < cpusize; node++) + { + MyList *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry); + build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]); + if (src_owned) src_owned->destroyList(); + } + if (dst) dst->destroyList(); + + cache.valid = true; + } + + // Use transfermix instead of transfer for mix-mode interpolation + int myrank; + MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + int cpusize = cache.cpusize; + + int req_no = 0; + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) + { + int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[node] = length; + if (length > 0) + { + if (length > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[length]; + cache.recv_buf_caps[node] = length; + } + data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + } + } + else + { + int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + cache.send_lengths[node] = slength; + if (slength > 0) + { + if (slength > cache.send_buf_caps[node]) + { + if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; + cache.send_bufs[node] = new double[slength]; + cache.send_buf_caps[node] = slength; + } + data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++); + } + int rlength = data_packermix(0, cache.combined_src[node], 
cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[node] = rlength; + if (rlength > 0) + { + if (rlength > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[rlength]; + cache.recv_buf_caps[node] = rlength; + } + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++); + } + } + } + + MPI_Waitall(req_no, cache.reqs, cache.stats); + + for (int node = 0; node < cpusize; node++) + if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0) + data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); +} + // collect all buffer grid segments or blocks for given patch MyList *Parallel::build_buffer_gsl(Patch *Pat) { diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h index 12fc356..a6ef351 100644 --- a/AMSS_NCKU_source/Parallel.h +++ b/AMSS_NCKU_source/Parallel.h @@ -81,6 +81,43 @@ namespace Parallel int Symmetry); void Sync(Patch *Pat, MyList *VarList, int Symmetry); void Sync(MyList *PatL, MyList *VarList, int Symmetry); + void Sync_merged(MyList *PatL, MyList *VarList, int Symmetry); + + struct SyncCache { + bool valid; + int cpusize; + MyList **combined_src; + MyList **combined_dst; + int *send_lengths; + int *recv_lengths; + double **send_bufs; + double **recv_bufs; + int *send_buf_caps; + int *recv_buf_caps; + MPI_Request *reqs; + MPI_Status *stats; + int max_reqs; + bool lengths_valid; + SyncCache(); + void invalidate(); + void destroy(); + }; + + void Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache); + void transfer_cached(MyList **src, MyList **dst, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache); + + struct AsyncSyncState { + int req_no; + bool active; + AsyncSyncState() : req_no(0), active(false) {} + }; + + void Sync_start(MyList *PatL, MyList 
*VarList, int Symmetry, + SyncCache &cache, AsyncSyncState &state); + void Sync_finish(SyncCache &cache, AsyncSyncState &state, + MyList *VarList, int Symmetry); void OutBdLow2Hi(Patch *Patc, Patch *Patf, MyList *VarList1 /* source */, MyList *VarList2 /* target */, int Symmetry); @@ -93,6 +130,15 @@ namespace Parallel void OutBdLow2Himix(MyList *PatcL, MyList *PatfL, MyList *VarList1 /* source */, MyList *VarList2 /* target */, int Symmetry); + void Restrict_cached(MyList *PatcL, MyList *PatfL, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache); + void OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache); + void OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache); void Prolong(Patch *Patc, Patch *Patf, MyList *VarList1 /* source */, MyList *VarList2 /* target */, int Symmetry); diff --git a/AMSS_NCKU_source/TwoPunctures.C b/AMSS_NCKU_source/TwoPunctures.C index 2a9c710..1b6e590 100644 --- a/AMSS_NCKU_source/TwoPunctures.C +++ b/AMSS_NCKU_source/TwoPunctures.C @@ -27,6 +27,7 @@ using namespace std; #endif #include "TwoPunctures.h" +#include TwoPunctures::TwoPunctures(double mp, double mm, double b, double P_plusx, double P_plusy, double P_plusz, @@ -59,6 +60,10 @@ TwoPunctures::TwoPunctures(double mp, double mm, double b, F = dvector(0, ntotal - 1); allocate_derivs(&u, ntotal); allocate_derivs(&v, ntotal); + D1_A = NULL; D2_A = NULL; D1_B = NULL; D2_B = NULL; + DF1_phi = NULL; DF2_phi = NULL; + precompute_derivative_matrices(); + allocate_workspace(); } TwoPunctures::~TwoPunctures() @@ -66,6 +71,13 @@ TwoPunctures::~TwoPunctures() free_dvector(F, 0, ntotal - 1); free_derivs(&u, ntotal); free_derivs(&v, ntotal); + free_workspace(); + if (D1_A) delete[] D1_A; + if (D2_A) delete[] D2_A; + if (D1_B) delete[] D1_B; + if (D2_B) delete[] D2_B; + if (DF1_phi) delete[] DF1_phi; + if (DF2_phi) delete[] DF2_phi; } 
void TwoPunctures::Solve() @@ -302,7 +314,7 @@ void TwoPunctures::set_initial_guess(derivs v) v.d0[indx] = 0; // set initial guess 0 v.d0[indx] /= (-cos(Pih * (2 * i + 1) / n1) - 1.0); // PRD 70, 064011 (2004) Eq.(5), from u to U } - Derivatives_AB3(nvar, n1, n2, n3, v); + Derivatives_AB3_MatMul(nvar, n1, n2, n3, v); if (0) { debug_file = fopen("initial.dat", "w"); @@ -891,25 +903,17 @@ double TwoPunctures::norm1(double *v, int n) /* -------------------------------------------------------------------------*/ double TwoPunctures::norm2(double *v, int n) { - int i; - double result = 0; - - for (i = 0; i < n; i++) - result += v[i] * v[i]; - - return sqrt(result); + // Optimized with oneMKL BLAS DNRM2 + // Computes: sqrt(sum(v[i]^2)) + return cblas_dnrm2(n, v, 1); } /* -------------------------------------------------------------------------*/ double TwoPunctures::scalarproduct(double *v, double *w, int n) { - int i; - double result = 0; - - for (i = 0; i < n; i++) - result += v[i] * w[i]; - - return result; + // Optimized with oneMKL BLAS DDOT + // Computes: sum(v[i] * w[i]) + return cblas_ddot(n, v, 1, w, 1); } /* -------------------------------------------------------------------------*/ @@ -1291,9 +1295,6 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F, derivs U; double *sources; - values = dvector(0, nvar - 1); - allocate_derivs(&U, nvar); - sources = (double *)calloc(n1 * n2 * n3, sizeof(double)); if (0) { @@ -1350,7 +1351,7 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F, for (k = 0; k < n3; k++) sources[Index(0, i, j, k, 1, n1, n2, n3)] = 0.0; - Derivatives_AB3(nvar, n1, n2, n3, v); + Derivatives_AB3_MatMul(nvar, n1, n2, n3, v); double psi, psi2, psi4, psi7, r_plus, r_minus; FILE *debugfile = NULL; if (0) @@ -1358,12 +1359,22 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F, debugfile = fopen("res.dat", "w"); assert(debugfile); } + #pragma omp parallel for 
collapse(3) schedule(dynamic,1) \ + private(i, j, k, ivar, indx, al, be, A, B, X, R, x, r, phi, y, z, Am1, \ + psi, psi2, psi4, psi7, r_plus, r_minus) for (i = 0; i < n1; i++) { for (j = 0; j < n2; j++) { for (k = 0; k < n3; k++) { + double l_values[1]; // nvar=1, stack-allocated + derivs l_U; + double l_U_d0[1], l_U_d1[1], l_U_d2[1], l_U_d3[1]; + double l_U_d11[1], l_U_d12[1], l_U_d13[1], l_U_d22[1], l_U_d23[1], l_U_d33[1]; + l_U.d0 = l_U_d0; l_U.d1 = l_U_d1; l_U.d2 = l_U_d2; l_U.d3 = l_U_d3; + l_U.d11 = l_U_d11; l_U.d12 = l_U_d12; l_U.d13 = l_U_d13; + l_U.d22 = l_U_d22; l_U.d23 = l_U_d23; l_U.d33 = l_U_d33; al = Pih * (2 * i + 1) / n1; A = -cos(al); @@ -1375,72 +1386,36 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F, for (ivar = 0; ivar < nvar; ivar++) { indx = Index(ivar, i, j, k, nvar, n1, n2, n3); - U.d0[ivar] = Am1 * v.d0[indx]; /* U*/ - U.d1[ivar] = v.d0[indx] + Am1 * v.d1[indx]; /* U_A*/ - U.d2[ivar] = Am1 * v.d2[indx]; /* U_B*/ - U.d3[ivar] = Am1 * v.d3[indx]; /* U_3*/ - U.d11[ivar] = 2 * v.d1[indx] + Am1 * v.d11[indx]; /* U_AA*/ - U.d12[ivar] = v.d2[indx] + Am1 * v.d12[indx]; /* U_AB*/ - U.d13[ivar] = v.d3[indx] + Am1 * v.d13[indx]; /* U_AB*/ - U.d22[ivar] = Am1 * v.d22[indx]; /* U_BB*/ - U.d23[ivar] = Am1 * v.d23[indx]; /* U_B3*/ - U.d33[ivar] = Am1 * v.d33[indx]; /* U_33*/ + l_U.d0[ivar] = Am1 * v.d0[indx]; + l_U.d1[ivar] = v.d0[indx] + Am1 * v.d1[indx]; + l_U.d2[ivar] = Am1 * v.d2[indx]; + l_U.d3[ivar] = Am1 * v.d3[indx]; + l_U.d11[ivar] = 2 * v.d1[indx] + Am1 * v.d11[indx]; + l_U.d12[ivar] = v.d2[indx] + Am1 * v.d12[indx]; + l_U.d13[ivar] = v.d3[indx] + Am1 * v.d13[indx]; + l_U.d22[ivar] = Am1 * v.d22[indx]; + l_U.d23[ivar] = Am1 * v.d23[indx]; + l_U.d33[ivar] = Am1 * v.d33[indx]; } - /* Calculation of (X,R) and*/ - /* (U_X, U_R, U_3, U_XX, U_XR, U_X3, U_RR, U_R3, U_33)*/ - AB_To_XR(nvar, A, B, &X, &R, U); - /* Calculation of (x,r) and*/ - /* (U, U_x, U_r, U_3, U_xx, U_xr, U_x3, U_rr, U_r3, U_33)*/ - C_To_c(nvar, X, 
R, &x, &r, U); - /* Calculation of (y,z) and*/ - /* (U, U_x, U_y, U_z, U_xx, U_xy, U_xz, U_yy, U_yz, U_zz)*/ - rx3_To_xyz(nvar, x, r, phi, &y, &z, U); + AB_To_XR(nvar, A, B, &X, &R, l_U); + C_To_c(nvar, X, R, &x, &r, l_U); + rx3_To_xyz(nvar, x, r, phi, &y, &z, l_U); NonLinEquations(sources[Index(0, i, j, k, 1, n1, n2, n3)], - A, B, X, R, x, r, phi, y, z, U, values); + A, B, X, R, x, r, phi, y, z, l_U, l_values); for (ivar = 0; ivar < nvar; ivar++) { indx = Index(ivar, i, j, k, nvar, n1, n2, n3); - F[indx] = values[ivar] * FAC; - /* if ((i<5) && ((j<5) || (j>n2-5)))*/ - /* F[indx] = 0.0;*/ - u.d0[indx] = U.d0[ivar]; /* U*/ - u.d1[indx] = U.d1[ivar]; /* U_x*/ - u.d2[indx] = U.d2[ivar]; /* U_y*/ - u.d3[indx] = U.d3[ivar]; /* U_z*/ - u.d11[indx] = U.d11[ivar]; /* U_xx*/ - u.d12[indx] = U.d12[ivar]; /* U_xy*/ - u.d13[indx] = U.d13[ivar]; /* U_xz*/ - u.d22[indx] = U.d22[ivar]; /* U_yy*/ - u.d23[indx] = U.d23[ivar]; /* U_yz*/ - u.d33[indx] = U.d33[ivar]; /* U_zz*/ - } - if (debugfile && (k == 0)) - { - r_plus = sqrt((x - par_b) * (x - par_b) + y * y + z * z); - r_minus = sqrt((x + par_b) * (x + par_b) + y * y + z * z); - psi = 1. 
+ - 0.5 * par_m_plus / r_plus + - 0.5 * par_m_minus / r_minus + - U.d0[0]; - psi2 = psi * psi; - psi4 = psi2 * psi2; - psi7 = psi * psi2 * psi4; - fprintf(debugfile, - "%.16g %.16g %.16g %.16g %.16g %.16g %.16g %.16g\n", - (double)x, (double)y, (double)A, (double)B, - (double)(U.d11[0] + - U.d22[0] + - U.d33[0] + - /* 0.125 * BY_KKofxyz (x, y, z) / psi7 +*/ - (2.0 * Pi / psi2 / psi * sources[indx]) * FAC), - (double)((U.d11[0] + - U.d22[0] + - U.d33[0]) * - FAC), - (double)(-(2.0 * Pi / psi2 / psi * sources[indx]) * FAC), - (double)sources[indx] - /*(double)F[indx]*/ - ); + F[indx] = l_values[ivar] * sin(al) * sin(be) * sin(al) * sin(be) * sin(al) * sin(be); + u.d0[indx] = l_U.d0[ivar]; + u.d1[indx] = l_U.d1[ivar]; + u.d2[indx] = l_U.d2[ivar]; + u.d3[indx] = l_U.d3[ivar]; + u.d11[indx] = l_U.d11[ivar]; + u.d12[indx] = l_U.d12[ivar]; + u.d13[indx] = l_U.d13[ivar]; + u.d22[indx] = l_U.d22[ivar]; + u.d23[indx] = l_U.d23[ivar]; + u.d33[indx] = l_U.d33[ivar]; } } } @@ -1450,8 +1425,6 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F, fclose(debugfile); } free(sources); - free_dvector(values, 0, nvar - 1); - free_derivs(&U, nvar); } /* --------------------------------------------------------------------------*/ double TwoPunctures::norm_inf(double const *F, int const ntotal) @@ -1551,7 +1524,7 @@ int TwoPunctures::bicgstab(int const nvar, int const n1, int const n2, int const for (int j = 0; j < ntotal; j++) ph.d0[j] = 0; for (int j = 0; j < NRELAX; j++) /* solves JFD*ph = p by relaxation*/ - relax(ph.d0, nvar, n1, n2, n3, p, ncols, cols, JFD); + relax_omp(ph.d0, nvar, n1, n2, n3, p, ncols, cols, JFD); J_times_dv(nvar, n1, n2, n3, ph, vv, u); /* vv=J*ph*/ alpha = rho / scalarproduct(rt, vv, ntotal); @@ -1577,7 +1550,7 @@ int TwoPunctures::bicgstab(int const nvar, int const n1, int const n2, int const for (int j = 0; j < ntotal; j++) sh.d0[j] = 0; for (int j = 0; j < NRELAX; j++) /* solves JFD*sh = s by relaxation*/ - relax(sh.d0, nvar, 
n1, n2, n3, s, ncols, cols, JFD); + relax_omp(sh.d0, nvar, n1, n2, n3, s, ncols, cols, JFD); J_times_dv(nvar, n1, n2, n3, sh, t, u); /* t=J*sh*/ omega = scalarproduct(t, s, ntotal) / scalarproduct(t, t, ntotal); @@ -1852,21 +1825,32 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl /* (u.d1[], u.d2[], u.d3[], u.d11[], u.d12[], u.d13[], u.d22[], u.d23[], u.d33[])*/ /* at interior points and at the boundaries "+/-"*/ int i, j, k, ivar, indx; - double al, be, A, B, X, R, x, r, phi, y, z, Am1, *values; - derivs dU, U; + double al, be, A, B, X, R, x, r, phi, y, z, Am1; - Derivatives_AB3(nvar, n1, n2, n3, dv); + Derivatives_AB3_MatMul(nvar, n1, n2, n3, dv); + #pragma omp parallel for schedule(dynamic,1) \ + private(j, k, ivar, indx, al, be, A, B, X, R, x, r, phi, y, z, Am1) for (i = 0; i < n1; i++) { - values = dvector(0, nvar - 1); - allocate_derivs(&dU, nvar); - allocate_derivs(&U, nvar); + // Thread-local derivs on stack (nvar=1) + double l_val[1]; + double l_dU_d0[1], l_dU_d1[1], l_dU_d2[1], l_dU_d3[1]; + double l_dU_d11[1], l_dU_d12[1], l_dU_d13[1], l_dU_d22[1], l_dU_d23[1], l_dU_d33[1]; + double l_U_d0[1], l_U_d1[1], l_U_d2[1], l_U_d3[1]; + double l_U_d11[1], l_U_d12[1], l_U_d13[1], l_U_d22[1], l_U_d23[1], l_U_d33[1]; + derivs l_dU, l_U; + l_dU.d0=l_dU_d0; l_dU.d1=l_dU_d1; l_dU.d2=l_dU_d2; l_dU.d3=l_dU_d3; + l_dU.d11=l_dU_d11; l_dU.d12=l_dU_d12; l_dU.d13=l_dU_d13; + l_dU.d22=l_dU_d22; l_dU.d23=l_dU_d23; l_dU.d33=l_dU_d33; + l_U.d0=l_U_d0; l_U.d1=l_U_d1; l_U.d2=l_U_d2; l_U.d3=l_U_d3; + l_U.d11=l_U_d11; l_U.d12=l_U_d12; l_U.d13=l_U_d13; + l_U.d22=l_U_d22; l_U.d23=l_U_d23; l_U.d33=l_U_d33; + for (j = 0; j < n2; j++) { for (k = 0; k < n3; k++) { - al = Pih * (2 * i + 1) / n1; A = -cos(al); be = Pih * (2 * j + 1) / n2; @@ -1877,104 +1861,193 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl for (ivar = 0; ivar < nvar; ivar++) { indx = Index(ivar, i, j, k, nvar, n1, n2, n3); - dU.d0[ivar] = Am1 * 
dv.d0[indx]; /* dU*/ - dU.d1[ivar] = dv.d0[indx] + Am1 * dv.d1[indx]; /* dU_A*/ - dU.d2[ivar] = Am1 * dv.d2[indx]; /* dU_B*/ - dU.d3[ivar] = Am1 * dv.d3[indx]; /* dU_3*/ - dU.d11[ivar] = 2 * dv.d1[indx] + Am1 * dv.d11[indx]; /* dU_AA*/ - dU.d12[ivar] = dv.d2[indx] + Am1 * dv.d12[indx]; /* dU_AB*/ - dU.d13[ivar] = dv.d3[indx] + Am1 * dv.d13[indx]; /* dU_AB*/ - dU.d22[ivar] = Am1 * dv.d22[indx]; /* dU_BB*/ - dU.d23[ivar] = Am1 * dv.d23[indx]; /* dU_B3*/ - dU.d33[ivar] = Am1 * dv.d33[indx]; /* dU_33*/ - U.d0[ivar] = u.d0[indx]; /* U */ - U.d1[ivar] = u.d1[indx]; /* U_x*/ - U.d2[ivar] = u.d2[indx]; /* U_y*/ - U.d3[ivar] = u.d3[indx]; /* U_z*/ - U.d11[ivar] = u.d11[indx]; /* U_xx*/ - U.d12[ivar] = u.d12[indx]; /* U_xy*/ - U.d13[ivar] = u.d13[indx]; /* U_xz*/ - U.d22[ivar] = u.d22[indx]; /* U_yy*/ - U.d23[ivar] = u.d23[indx]; /* U_yz*/ - U.d33[ivar] = u.d33[indx]; /* U_zz*/ + l_dU.d0[ivar] = Am1 * dv.d0[indx]; + l_dU.d1[ivar] = dv.d0[indx] + Am1 * dv.d1[indx]; + l_dU.d2[ivar] = Am1 * dv.d2[indx]; + l_dU.d3[ivar] = Am1 * dv.d3[indx]; + l_dU.d11[ivar] = 2 * dv.d1[indx] + Am1 * dv.d11[indx]; + l_dU.d12[ivar] = dv.d2[indx] + Am1 * dv.d12[indx]; + l_dU.d13[ivar] = dv.d3[indx] + Am1 * dv.d13[indx]; + l_dU.d22[ivar] = Am1 * dv.d22[indx]; + l_dU.d23[ivar] = Am1 * dv.d23[indx]; + l_dU.d33[ivar] = Am1 * dv.d33[indx]; + l_U.d0[ivar] = u.d0[indx]; + l_U.d1[ivar] = u.d1[indx]; + l_U.d2[ivar] = u.d2[indx]; + l_U.d3[ivar] = u.d3[indx]; + l_U.d11[ivar] = u.d11[indx]; + l_U.d12[ivar] = u.d12[indx]; + l_U.d13[ivar] = u.d13[indx]; + l_U.d22[ivar] = u.d22[indx]; + l_U.d23[ivar] = u.d23[indx]; + l_U.d33[ivar] = u.d33[indx]; } - /* Calculation of (X,R) and*/ - /* (dU_X, dU_R, dU_3, dU_XX, dU_XR, dU_X3, dU_RR, dU_R3, dU_33)*/ - AB_To_XR(nvar, A, B, &X, &R, dU); - /* Calculation of (x,r) and*/ - /* (dU, dU_x, dU_r, dU_3, dU_xx, dU_xr, dU_x3, dU_rr, dU_r3, dU_33)*/ - C_To_c(nvar, X, R, &x, &r, dU); - /* Calculation of (y,z) and*/ - /* (dU, dU_x, dU_y, dU_z, dU_xx, dU_xy, dU_xz, dU_yy, dU_yz, 
dU_zz)*/ - rx3_To_xyz(nvar, x, r, phi, &y, &z, dU); - LinEquations(A, B, X, R, x, r, phi, y, z, dU, U, values); + AB_To_XR(nvar, A, B, &X, &R, l_dU); + C_To_c(nvar, X, R, &x, &r, l_dU); + rx3_To_xyz(nvar, x, r, phi, &y, &z, l_dU); + LinEquations(A, B, X, R, x, r, phi, y, z, l_dU, l_U, l_val); for (ivar = 0; ivar < nvar; ivar++) { indx = Index(ivar, i, j, k, nvar, n1, n2, n3); - Jdv[indx] = values[ivar] * FAC; + Jdv[indx] = l_val[ivar] * sin(al) * sin(be) * sin(al) * sin(be) * sin(al) * sin(be); } } } - free_dvector(values, 0, nvar - 1); - free_derivs(&dU, nvar); - free_derivs(&U, nvar); } } /* --------------------------------------------------------------------------*/ -void TwoPunctures::relax(double *dv, int const nvar, int const n1, int const n2, int const n3, +/* -------------------------------------------------------------------------- + * relax_omp: OpenMP-parallelized replacement for relax() + * + * Parallelism analysis: + * - The red-black ordering within each phi-plane means that + * same-parity lines in the i-direction are INDEPENDENT of each other + * (they only couple through the j-direction which is solved internally). + * - Similarly, same-parity lines in the j-direction are independent. + * - Different phi-planes (k) with same parity are independent. 
+ * + * Strategy: + * - Parallelize the i-loop within each (k, parity) group for LineRelax_be + * - Parallelize the j-loop within each (k, parity) group for LineRelax_al + * - Each thread uses its own pre-allocated workspace (tid-indexed) + * --------------------------------------------------------------------------*/ +void TwoPunctures::relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3, double const *rhs, int const *ncols, int **cols, double **JFD) { - int i, j, k, n; + int n; - for (k = 0; k < n3; k = k + 2) - { + // even-k planes for (n = 0; n < N_PlaneRelax; n++) { - for (i = 2; i < n1; i = i + 2) - LineRelax_be(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - for (i = 1; i < n1; i = i + 2) - LineRelax_be(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - for (j = 1; j < n2; j = j + 2) - LineRelax_al(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - for (j = 0; j < n2; j = j + 2) - LineRelax_al(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); + // even-i lines, all even k -- distinct k planes are fully independent + int n_even_k = (n3 + 1) / 2; // number of even k values + int n_even_i = (n1 - 2 + 1) / 2; // count of i = 2, 4, ... + int total_tasks = n_even_k * n_even_i; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_even_i; + int ii = task % n_even_i; + int k = ki * 2; + int i = 2 + ii * 2; + LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // odd-i lines, all even k + int n_odd_i = n1 / 2; // count of i = 1, 3, ... + total_tasks = n_even_k * n_odd_i; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_odd_i; + int ii = task % n_odd_i; + int k = ki * 2; + int i = 1 + ii * 2; + LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // odd-j lines, all even k + int n_odd_j = (n2 - 1 + 1) / 2; + total_tasks = n_even_k * n_odd_j; + + #pragma omp parallel for schedule(static) 
+ for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_odd_j; + int ji = task % n_odd_j; + int k = ki * 2; + int j = 1 + ji * 2; + LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // even-j lines, all even k + int n_even_j = (n2 + 1) / 2; + total_tasks = n_even_k * n_even_j; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_even_j; + int ji = task % n_even_j; + int k = ki * 2; + int j = ji * 2; + LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // odd-k planes -- same four steps + int n_odd_k = n3 / 2; + + // even-i lines, all odd k + n_even_i = (n1 + 1) / 2; // i = 0, 2, ... + total_tasks = n_odd_k * n_even_i; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_even_i; + int ii = task % n_even_i; + int k = 1 + ki * 2; + int i = ii * 2; + LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // odd-i lines, all odd k + total_tasks = n_odd_k * n_odd_i; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_odd_i; + int ii = task % n_odd_i; + int k = 1 + ki * 2; + int i = 1 + ii * 2; + LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // odd-j lines, all odd k + total_tasks = n_odd_k * n_odd_j; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_odd_j; + int ji = task % n_odd_j; + int k = 1 + ki * 2; + int j = 1 + ji * 2; + LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // even-j lines, all odd k + total_tasks = n_odd_k * n_even_j; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = 
omp_get_thread_num(); + int ki = task / n_even_j; + int ji = task % n_even_j; + int k = 1 + ki * 2; + int j = ji * 2; + LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } } - } - for (k = 1; k < n3; k = k + 2) - { - for (n = 0; n < N_PlaneRelax; n++) - { - for (i = 0; i < n1; i = i + 2) - LineRelax_be(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - for (i = 1; i < n1; i = i + 2) - LineRelax_be(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - for (j = 1; j < n2; j = j + 2) - LineRelax_al(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - for (j = 0; j < n2; j = j + 2) - LineRelax_al(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - } - } } /* --------------------------------------------------------------------------*/ -void TwoPunctures::LineRelax_be(double *dv, +void TwoPunctures::LineRelax_be_omp(double *dv, int const i, int const k, int const nvar, int const n1, int const n2, int const n3, double const *rhs, int const *ncols, int **cols, - double **JFD) + double **JFD, int tid) { int j, m, Ic, Ip, Im, col, ivar; - double *diag = new double[n2]; - double *e = new double[n2 - 1]; /* above diagonal */ - double *f = new double[n2 - 1]; /* below diagonal */ - double *b = new double[n2]; /* rhs */ - double *x = new double[n2]; /* solution vector */ - - // gsl_vector *diag = gsl_vector_alloc(n2); - // gsl_vector *e = gsl_vector_alloc(n2-1); /* above diagonal */ - // gsl_vector *f = gsl_vector_alloc(n2-1); /* below diagonal */ - // gsl_vector *b = gsl_vector_alloc(n2); /* rhs */ - // gsl_vector *x = gsl_vector_alloc(n2); /* solution vector */ + // Use pre-allocated per-thread workspace instead of new/delete + double *diag = ws_diag_be[tid]; + double *e = ws_e_be[tid]; + double *f = ws_f_be[tid]; + double *b = ws_b_be[tid]; + double *x = ws_x_be[tid]; for (ivar = 0; ivar < nvar; ivar++) { @@ -2014,14 +2087,8 @@ void TwoPunctures::LineRelax_be(double *dv, } } } - // A x = b - // A = ( d_0 e_0 0 0 ) - // ( f_0 d_1 e_1 0 ) - // ( 
0 f_1 d_2 e_2 ) - // ( 0 0 f_2 d_3 ) - // - ThomasAlgorithm(n2, f, diag, e, x, b); - // gsl_linalg_solve_tridiag(diag, e, f, b, x); + ThomasAlgorithm_ws(n2, f, diag, e, x, b, + ws_l_be[tid], ws_u_be[tid], ws_d_be[tid], ws_y_be[tid]); for (j = 0; j < n2; j++) { Ic = Index(ivar, i, j, k, nvar, n1, n2, n3); @@ -2029,17 +2096,7 @@ void TwoPunctures::LineRelax_be(double *dv, // dv[Ic] = gsl_vector_get(x, j); } } - - delete[] diag; - delete[] e; - delete[] f; - delete[] b; - delete[] x; - // gsl_vector_free(diag); - // gsl_vector_free(e); - // gsl_vector_free(f); - // gsl_vector_free(b); - // gsl_vector_free(x); + // No delete — workspace is persistent } /* --------------------------------------------------------------------------*/ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2, @@ -2054,10 +2111,19 @@ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2, double sin_be, sin_be_i1, sin_be_i2, sin_be_i3, cos_be; double dV0, dV1, dV2, dV3, dV11, dV12, dV13, dV22, dV23, dV33, ha, ga, ga2, hb, gb, gb2, hp, gp, gp2, gagb, gagp, gbgp; - derivs dU, U; - allocate_derivs(&dU, nvar); - allocate_derivs(&U, nvar); + // Stack-allocated derivs (nvar=1) — no malloc/free! 
+ double dU_d0[1], dU_d1[1], dU_d2[1], dU_d3[1]; + double dU_d11[1], dU_d12[1], dU_d13[1], dU_d22[1], dU_d23[1], dU_d33[1]; + double U_d0[1], U_d1[1], U_d2[1], U_d3[1]; + double U_d11[1], U_d12[1], U_d13[1], U_d22[1], U_d23[1], U_d33[1]; + derivs dU, U; + dU.d0=dU_d0; dU.d1=dU_d1; dU.d2=dU_d2; dU.d3=dU_d3; + dU.d11=dU_d11; dU.d12=dU_d12; dU.d13=dU_d13; + dU.d22=dU_d22; dU.d23=dU_d23; dU.d33=dU_d33; + U.d0=U_d0; U.d1=U_d1; U.d2=U_d2; U.d3=U_d3; + U.d11=U_d11; U.d12=U_d12; U.d13=U_d13; + U.d22=U_d22; U.d23=U_d23; U.d33=U_d33; if (k < 0) k = k + n3; @@ -2125,12 +2191,9 @@ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2, dV11 = ga2 * (dv.d0[ipcc] + dv.d0[imcc] - 2 * dv.d0[iccc]); dV22 = gb2 * (dv.d0[icpc] + dv.d0[icmc] - 2 * dv.d0[iccc]); dV33 = gp2 * (dv.d0[iccp] + dv.d0[iccm] - 2 * dv.d0[iccc]); - dV12 = - 0.25 * gagb * (dv.d0[ippc] - dv.d0[ipmc] + dv.d0[immc] - dv.d0[impc]); - dV13 = - 0.25 * gagp * (dv.d0[ipcp] - dv.d0[imcp] + dv.d0[imcm] - dv.d0[ipcm]); - dV23 = - 0.25 * gbgp * (dv.d0[icpp] - dv.d0[icpm] + dv.d0[icmm] - dv.d0[icmp]); + dV12 = 0.25 * gagb * (dv.d0[ippc] - dv.d0[ipmc] + dv.d0[immc] - dv.d0[impc]); + dV13 = 0.25 * gagp * (dv.d0[ipcp] - dv.d0[imcp] + dv.d0[imcm] - dv.d0[ipcm]); + dV23 = 0.25 * gbgp * (dv.d0[icpp] - dv.d0[icpm] + dv.d0[icmm] - dv.d0[icmp]); /* Derivatives of (dv) w.r.t. 
(A,B,phi):*/ dV11 = sin_al_i3 * (sin_al * dV11 - cos_al * dV1); dV12 = sin_al_i1 * sin_be_i1 * dV12; @@ -2173,11 +2236,12 @@ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2, /* (dU, dU_x, dU_y, dU_z, dU_xx, dU_xy, dU_xz, dU_yy, dU_yz, dU_zz)*/ rx3_To_xyz(nvar, x, r, phi, &y, &z, dU); LinEquations(A, B, X, R, x, r, phi, y, z, dU, U, values); - for (ivar = 0; ivar < nvar; ivar++) - values[ivar] *= FAC; - free_derivs(&dU, nvar); - free_derivs(&U, nvar); + double FAC_val = sin_al * sin_be * sin_al * sin_be * sin_al * sin_be; + for (ivar = 0; ivar < nvar; ivar++) + values[ivar] *= FAC_val; + + // No free_derivs needed — everything is on the stack } #undef FAC /*-----------------------------------------------------------*/ @@ -2201,25 +2265,19 @@ void TwoPunctures::LinEquations(double A, double B, double X, double R, values[0] = dU.d11[0] + dU.d22[0] + dU.d33[0] - 0.875 * BY_KKofxyz(x, y, z) / psi8 * dU.d0[0]; } /* -------------------------------------------------------------------------*/ -void TwoPunctures::LineRelax_al(double *dv, +void TwoPunctures::LineRelax_al_omp(double *dv, int const j, int const k, int const nvar, int const n1, int const n2, int const n3, double const *rhs, int const *ncols, - int **cols, double **JFD) + int **cols, double **JFD, int tid) { int i, m, Ic, Ip, Im, col, ivar; - double *diag = new double[n1]; - double *e = new double[n1 - 1]; /* above diagonal */ - double *f = new double[n1 - 1]; /* below diagonal */ - double *b = new double[n1]; /* rhs */ - double *x = new double[n1]; /* solution vector */ - - // gsl_vector *diag = gsl_vector_alloc(n1); - // gsl_vector *e = gsl_vector_alloc(n1-1); /* above diagonal */ - // gsl_vector *f = gsl_vector_alloc(n1-1); /* below diagonal */ - // gsl_vector *b = gsl_vector_alloc(n1); /* rhs */ - // gsl_vector *x = gsl_vector_alloc(n1); /* solution vector */ + double *diag = ws_diag_al[tid]; + double *e = ws_e_al[tid]; + double *f = ws_f_al[tid]; + double *b = ws_b_al[tid]; + 
double *x = ws_x_al[tid]; for (ivar = 0; ivar < nvar; ivar++) { @@ -2259,8 +2317,8 @@ void TwoPunctures::LineRelax_al(double *dv, } } } - ThomasAlgorithm(n1, f, diag, e, x, b); - // gsl_linalg_solve_tridiag(diag, e, f, b, x); + ThomasAlgorithm_ws(n1, f, diag, e, x, b, + ws_l_al[tid], ws_u_al[tid], ws_d_al[tid], ws_y_al[tid]); for (i = 0; i < n1; i++) { Ic = Index(ivar, i, j, k, nvar, n1, n2, n3); @@ -2268,18 +2326,6 @@ void TwoPunctures::LineRelax_al(double *dv, // dv[Ic] = gsl_vector_get(x, i); } } - - delete[] diag; - delete[] e; - delete[] f; - delete[] b; - delete[] x; - - // gsl_vector_free(diag); - // gsl_vector_free(e); - // gsl_vector_free(f); - // gsl_vector_free(b); - // gsl_vector_free(x); } /* -------------------------------------------------------------------------*/ // a[N], b[N-1], c[N-1], x[N], q[N] @@ -2330,6 +2376,37 @@ void TwoPunctures::ThomasAlgorithm(int N, double *b, double *a, double *c, doubl return; } + +// ThomasAlgorithm with pre-allocated workspace (no new/delete) +// l[N-1], u_ws[N-1], d[N], y[N] are caller-provided workspace +void TwoPunctures::ThomasAlgorithm_ws(int N, double *b, double *a, double *c, + double *x, double *q, + double *l, double *u_ws, double *d, double *y) +{ + /* LU Decomposition */ + d[0] = a[0]; + u_ws[0] = c[0]; + + for (int i = 0; i < N - 2; i++) { + l[i] = b[i] / d[i]; + d[i + 1] = a[i + 1] - l[i] * u_ws[i]; + u_ws[i + 1] = c[i + 1]; + } + + l[N - 2] = b[N - 2] / d[N - 2]; + d[N - 1] = a[N - 1] - l[N - 2] * u_ws[N - 2]; + + /* Forward Substitution [L][y] = [q] */ + y[0] = q[0]; + for (int i = 1; i < N; i++) + y[i] = q[i] - l[i - 1] * y[i - 1]; + + /* Backward Substitution [U][x] = [y] */ + x[N - 1] = y[N - 1] / d[N - 1]; + for (int i = N - 2; i >= 0; i--) + x[i] = (y[i] - u_ws[i] * x[i + 1]) / d[i]; +} + // --------------------------------------------------------------------------*/ // Calculates the value of v at an arbitrary position (x,y,z) if the spectral coefficients are know*/*/ /* 
--------------------------------------------------------------------------*/ @@ -2519,3 +2596,606 @@ void TwoPunctures::SpecCoef(parameters par, int ivar, double *v, double *cf) free_d3tensor(values3, 0, n1, 0, n2, 0, n3); free_d3tensor(values4, 0, n1, 0, n2, 0, n3); } + +void TwoPunctures::allocate_workspace() +{ + int n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi; + max_threads = omp_get_max_threads(); + printf("Allocating workspace for %d threads\n", max_threads); + + // LineRelax_be workspace: arrays of size n2, per thread + ws_diag_be = new double*[max_threads]; + ws_e_be = new double*[max_threads]; + ws_f_be = new double*[max_threads]; + ws_b_be = new double*[max_threads]; + ws_x_be = new double*[max_threads]; + ws_l_be = new double*[max_threads]; + ws_u_be = new double*[max_threads]; + ws_d_be = new double*[max_threads]; + ws_y_be = new double*[max_threads]; + + // LineRelax_al workspace: arrays of size n1, per thread + ws_diag_al = new double*[max_threads]; + ws_e_al = new double*[max_threads]; + ws_f_al = new double*[max_threads]; + ws_b_al = new double*[max_threads]; + ws_x_al = new double*[max_threads]; + ws_l_al = new double*[max_threads]; + ws_u_al = new double*[max_threads]; + ws_d_al = new double*[max_threads]; + ws_y_al = new double*[max_threads]; + + int N = (n1 > n2) ? 
n1 : n2; // max of n1, n2 + + for (int t = 0; t < max_threads; t++) { + ws_diag_be[t] = new double[n2]; + ws_e_be[t] = new double[n2]; + ws_f_be[t] = new double[n2]; + ws_b_be[t] = new double[n2]; + ws_x_be[t] = new double[n2]; + ws_l_be[t] = new double[n2]; + ws_u_be[t] = new double[n2]; + ws_d_be[t] = new double[n2]; + ws_y_be[t] = new double[n2]; + + ws_diag_al[t] = new double[n1]; + ws_e_al[t] = new double[n1]; + ws_f_al[t] = new double[n1]; + ws_b_al[t] = new double[n1]; + ws_x_al[t] = new double[n1]; + ws_l_al[t] = new double[n1]; + ws_u_al[t] = new double[n1]; + ws_d_al[t] = new double[n1]; + ws_y_al[t] = new double[n1]; + } +} + +void TwoPunctures::free_workspace() +{ + for (int t = 0; t < max_threads; t++) { + delete[] ws_diag_be[t]; delete[] ws_e_be[t]; delete[] ws_f_be[t]; + delete[] ws_b_be[t]; delete[] ws_x_be[t]; + delete[] ws_l_be[t]; delete[] ws_u_be[t]; + delete[] ws_d_be[t]; delete[] ws_y_be[t]; + + delete[] ws_diag_al[t]; delete[] ws_e_al[t]; delete[] ws_f_al[t]; + delete[] ws_b_al[t]; delete[] ws_x_al[t]; + delete[] ws_l_al[t]; delete[] ws_u_al[t]; + delete[] ws_d_al[t]; delete[] ws_y_al[t]; + } + delete[] ws_diag_be; delete[] ws_e_be; delete[] ws_f_be; + delete[] ws_b_be; delete[] ws_x_be; + delete[] ws_l_be; delete[] ws_u_be; + delete[] ws_d_be; delete[] ws_y_be; + + delete[] ws_diag_al; delete[] ws_e_al; delete[] ws_f_al; + delete[] ws_b_al; delete[] ws_x_al; + delete[] ws_l_al; delete[] ws_u_al; + delete[] ws_d_al; delete[] ws_y_al; +} + +/*========================================================================== + * Precomputed Spectral Derivative Matrices + * + * Mathematical equivalence proof: + * + * Original algorithm (per-line): + * 1. Forward Chebyshev transform: c = T * f (where T is the DCT matrix) + * 2. Spectral derivative: c' = Dhat * c (recurrence relation) + * 3. Inverse transform: f' = T^{-1} * c' + * Combined: f' = T^{-1} * Dhat * T * f = D * f + * + * The matrix D = T^{-1} * Dhat * T is precomputed once. 
+ * Similarly D2 = T^{-1} * Dhat^2 * T for second derivatives. + * + * For Fourier: same idea with DFT matrices and frequency-domain derivatives. + * + * This converts n2*n3 separate O(n1^2) transforms into a single + * (n1 x n1) * (n1 x n2*n3) DGEMM call, which is BLAS Level-3 + * and thus optimally parallelized by MKL. + *=========================================================================*/ + +void TwoPunctures::build_cheb_deriv_matrices(int n, double *D1, double *D2) +{ + /* Build the physical-space derivative matrices for Chebyshev Zeros grid. + * + * Grid points: x_i = -cos(pi*(2i+1)/(2n)), i=0,...,n-1 + * + * Method: Construct T (forward transform), Dhat (spectral derivative), + * T^{-1} (inverse transform), then D1 = T^{-1} * Dhat * T, + * D2 = T^{-1} * Dhat^2 * T. + * + * All matrices are n x n, stored in row-major order: M[i*n+j] + */ + + double *T_fwd = new double[n * n]; // Forward transform matrix + double *T_inv = new double[n * n]; // Inverse transform matrix + double *Dhat = new double[n * n]; // Spectral derivative operator + double *Dhat2 = new double[n * n]; // Spectral second derivative operator + double *tmp1 = new double[n * n]; // Temporary + double *tmp2 = new double[n * n]; // Temporary + + double Pion = Pi / n; + + // Build forward Chebyshev transform matrix T + // c_j = (2/n) * (-1)^j * sum_k f_k * cos(pi*j*(k+0.5)/n) + // So T[j][k] = (2/n) * (-1)^j * cos(pi*j*(k+0.5)/n) + for (int j = 0; j < n; j++) { + double fac = (2.0 / n) * ((j % 2 == 0) ? 
1.0 : -1.0); + for (int k = 0; k < n; k++) { + T_fwd[j * n + k] = fac * cos(Pion * j * (k + 0.5)); + } + } + + // Build inverse Chebyshev transform matrix T^{-1} + // f_j = sum_k c_k * cos(pi*(j+0.5)*k/n) * (-1)^k - 0.5*c_0 + // But the -0.5*c_0 term is part of the sum when we write it as: + // f_j = -0.5*c_0 + sum_{k=0}^{n-1} c_k * cos(pi*(j+0.5)*k) * (-1)^k + // T_inv[j][k] = cos(pi*(j+0.5)*k/n) * (-1)^k, with k=0 term having extra -0.5 + for (int j = 0; j < n; j++) { + for (int k = 0; k < n; k++) { + double sign_k = (k % 2 == 0) ? 1.0 : -1.0; + T_inv[j * n + k] = cos(Pion * (j + 0.5) * k) * sign_k; + } + // The k=0 term needs adjustment: the sum includes c_0*1 but we need -0.5*c_0 + c_0*1 = 0.5*c_0 + // Wait, let me re-examine chebft_Zeros with inv=1: + // sum = -0.5 * u[0]; + // for k: sum += u[k] * cos(Pion*(j+0.5)*k) * isignum; isignum alternates starting from 1 + // So: c[j] = -0.5*u[0] + sum_{k=0}^{n-1} u[k]*cos(...)*(-1)^k + // = -0.5*u[0] + u[0]*1*1 + sum_{k=1} ... + // = 0.5*u[0] + sum_{k=1} u[k]*cos(...)*(-1)^k + // Equivalently: T_inv[j][0] = 0.5, T_inv[j][k] = cos(...)*(-1)^k for k>=1 + // But cos(0) = 1 and (-1)^0 = 1, so the formula gives T_inv[j][0] = 1.0 + // We need it to be 0.5. Fix: + T_inv[j * n + 0] = 0.5; // This accounts for the -0.5*u[0] + u[0]*cos(0)*1 = 0.5*u[0] + } + + // Build spectral derivative matrix Dhat (in coefficient space) + // The recurrence: cder[n-1] = 0, cder[n-2] = 0, + // cder[j] = cder[j+2] + 2*(j+1)*c[j+1] for j = n-3,...,0 + // This means cder = Dhat * c, where Dhat is upper triangular-ish. 
+ // Dhat[j][k] = coefficient of c[k] contributing to cder[j] + // + // From the recurrence: cder[j] = sum_{k=j+1, k-j odd}^{n-1} 2*k * c[k] + // (with the factor 2k, summing over k > j where k-j is odd) + // Exception: cder[0] gets an extra factor of 0.5 since c[0] has the 2/n prefactor + // Actually no: the chder function is: + // cder[n] = cder[n-1] = 0 + // cder[j] = cder[j+2] + 2*(j+1)*c[j+1] + // Unrolling: cder[j] = 2*(j+1)*c[j+1] + 2*(j+3)*c[j+3] + ... + // So Dhat[j][k] = 2*k if k > j and (k-j) is odd, else 0 + + for (int j = 0; j < n; j++) + for (int k = 0; k < n; k++) + Dhat[j * n + k] = 0.0; + + for (int j = 0; j < n; j++) { + for (int k = j + 1; k < n; k++) { + if ((k - j) % 2 == 1) { + Dhat[j * n + k] = 2.0 * k; + } + } + } + + // Build Dhat^2 = Dhat * Dhat + // D1 = T_inv * Dhat * T_fwd + // D2 = T_inv * Dhat^2 * T_fwd + + // tmp1 = Dhat * T_fwd + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, 1.0, Dhat, n, T_fwd, n, 0.0, tmp1, n); + // D1 = T_inv * tmp1 + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, 1.0, T_inv, n, tmp1, n, 0.0, D1, n); + + // tmp2 = Dhat * Dhat (Dhat^2 in spectral space) + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, 1.0, Dhat, n, Dhat, n, 0.0, tmp2, n); + // tmp1 = Dhat^2 * T_fwd + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, 1.0, tmp2, n, T_fwd, n, 0.0, tmp1, n); + // D2 = T_inv * tmp1 + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, 1.0, T_inv, n, tmp1, n, 0.0, D2, n); + + delete[] T_fwd; + delete[] T_inv; + delete[] Dhat; + delete[] Dhat2; + delete[] tmp1; + delete[] tmp2; +} + +void TwoPunctures::build_fourier_deriv_matrices(int N, double *DF1, double *DF2) +{ + /* Build Fourier derivative matrices in physical space. + * + * Grid: phi_k = 2*pi*k/N, k=0,...,N-1 + * + * The Fourier interpolant derivative at grid points can be expressed as + * a matrix multiply. We build it by: + * 1. Forward Fourier transform matrix F + * 2. 
Frequency-domain derivative (multiply by il for first, -l^2 for second) + * 3. Inverse Fourier transform matrix F^{-1} + * DF1 = F^{-1} * diag(il) * F, DF2 = F^{-1} * diag(-l^2) * F + * + * But since fourft/fourev use a real representation (a_l, b_l), + * we construct directly in physical space. + */ + + int M = N / 2; + double Pi_fac = Pi / M; // = 2*Pi/N + + // DF1[j][k] = d/dphi of the interpolant at phi_j, due to value at phi_k + // Using the representation: + // f(phi) = 0.5*(a_0 + a_M*cos(M*phi)) + sum_{l=1}^{M-1} (a_l*cos(l*phi) + b_l*sin(l*phi)) + // where a_l = (2/N)*sum_k f_k*cos(l*phi_k), b_l = (2/N)*sum_k f_k*sin(l*phi_k) + // + // f'(phi) = -0.5*a_M*M*sin(M*phi) + sum_{l=1}^{M-1} l*(-a_l*sin(l*phi) + b_l*cos(l*phi)) + // + // Substituting a_l, b_l and evaluating at phi_j: + // f'(phi_j) = sum_k f_k * K(j,k) + // where K(j,k) = (2/N) * sum_{l=1}^{M-1} l * (-cos(l*phi_k)*sin(l*phi_j) + sin(l*phi_k)*cos(l*phi_j)) + // + (2/N) * (-M/2) * sin(M*phi_j) * cos(M*phi_k) [a_M term, note a_M has no factor 2] + // = (2/N) * sum_{l=1}^{M-1} l * sin(l*(phi_k - phi_j)) + // - (1/N) * M * sin(M*phi_j) * cos(M*phi_k) + // + // But the a_M coefficient in fourft has factor 1/M (not 2/M), so: + // Actually re-examining fourft: a[l] = fac * sum_k u[k]*cos(x), fac=1/M + // and a_M is stored as a[M] with same fac. The inverse uses: + // u[k] = 0.5*(a[0] + a[M]*iy) + sum_{l=1}^{M-1}(a[l]*cos + b[l]*sin) + // So the full expression needs care. Let me just compute it numerically. + + // Numerical approach: for each k, set f = delta_k, compute derivative at all j + double *p = new double[N]; + double *dp = new double[N]; + + for (int k = 0; k < N; k++) { + // Set delta function at k + for (int i = 0; i < N; i++) + p[i] = (i == k) ? 
1.0 : 0.0; + + // Forward Fourier transform (using existing fourft) + fourft(p, N, 0); + // Derivative in spectral space + fourder(p, dp, N); + // Inverse Fourier transform + fourft(dp, N, 1); + + // dp[j] = derivative of delta_k interpolant at phi_j + // So DF1[j][k] = dp[j] + for (int j = 0; j < N; j++) + DF1[j * N + k] = dp[j]; + } + + // Second derivative + for (int k = 0; k < N; k++) { + for (int i = 0; i < N; i++) + p[i] = (i == k) ? 1.0 : 0.0; + + fourft(p, N, 0); + fourder2(p, dp, N); + fourft(dp, N, 1); + + for (int j = 0; j < N; j++) + DF2[j * N + k] = dp[j]; + } + + delete[] p; + delete[] dp; +} + +void TwoPunctures::precompute_derivative_matrices() +{ + int n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi; + + // Allocate matrices + D1_A = new double[n1 * n1]; + D2_A = new double[n1 * n1]; + D1_B = new double[n2 * n2]; + D2_B = new double[n2 * n2]; + DF1_phi = new double[n3 * n3]; + DF2_phi = new double[n3 * n3]; + + // Build Chebyshev derivative matrices + build_cheb_deriv_matrices(n1, D1_A, D2_A); + build_cheb_deriv_matrices(n2, D1_B, D2_B); + + // Build Fourier derivative matrices + build_fourier_deriv_matrices(n3, DF1_phi, DF2_phi); + + printf("Precomputed derivative matrices: A(%d), B(%d), phi(%d)\n", n1, n2, n3); +} + +/* -------------------------------------------------------------------------- + * Derivatives_AB3_MatMul: Drop-in replacement for Derivatives_AB3 + * + * Uses precomputed derivative matrices and DGEMM to compute all spectral + * derivatives in batch. Mathematically equivalent to the original + * Derivatives_AB3. + * + * Memory layout of v.d0[Index(ivar,i,j,k)] = v.d0[ivar + nvar*(i + n1*(j + n2*k))] + * + * For A-direction derivatives (fixed j,k, varying i): + * We need to apply D1_A and D2_A to "pencils" along the i-direction. + * Collect all pencils into a matrix and use DGEMM. + * + * For B-direction derivatives (fixed i,k, varying j): + * Similarly with D1_B, D2_B. 
+ * + * For phi-direction (fixed i,j, varying k): + * Similarly with DF1_phi, DF2_phi. + * --------------------------------------------------------------------------*/ +void TwoPunctures::Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v) +{ + int total_pencils; + double *data_in, *data_out; + + /*===================================================== + * STEP 1: A-direction derivatives (Chebyshev, D1_A, D2_A) + * + * For each (ivar, j, k), we have a pencil of length n1: + * f[i] = v.d0[Index(ivar, i, j, k, nvar, n1, n2, n3)] + * + * We want: v.d1[...] = D1_A * f, v.d11[...] = D2_A * f + * + * Collect all n2*n3*nvar pencils as columns of a matrix: + * data_in[i, col] where col = ivar + nvar*(j + n2*k) + * Then: data_out = D1_A * data_in (DGEMM: n1 x n1 times n1 x total_pencils) + *=====================================================*/ + total_pencils = nvar * n2 * n3; + + data_in = new double[n1 * total_pencils]; + data_out = new double[n1 * total_pencils]; + + // Gather: data_in[i * total_pencils + col] = v.d0[Index(ivar,i,j,k,...)] + // where col = ivar + nvar * (j + n2 * k) + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (j + n2 * k); + for (int i = 0; i < n1; i++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + data_in[i * total_pencils + col] = v.d0[indx]; + } + } + } + } + + // First derivative: data_out = D1_A * data_in + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n1, total_pencils, n1, + 1.0, D1_A, n1, data_in, total_pencils, + 0.0, data_out, total_pencils); + + // Scatter to v.d1 + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (j + n2 * k); + for (int i = 0; i < n1; i++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d1[indx] = data_out[i * total_pencils + col]; + } + } + } + } + + // Second derivative: data_out = D2_A * data_in + 
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n1, total_pencils, n1, + 1.0, D2_A, n1, data_in, total_pencils, + 0.0, data_out, total_pencils); + + // Scatter to v.d11 + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (j + n2 * k); + for (int i = 0; i < n1; i++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d11[indx] = data_out[i * total_pencils + col]; + } + } + } + } + + delete[] data_in; + delete[] data_out; + + /*===================================================== + * STEP 2: B-direction derivatives (Chebyshev, D1_B, D2_B) + * + * Pencils along j for each (ivar, i, k). + * Also compute mixed derivative v.d12 = D1_B applied to v.d1 + *=====================================================*/ + total_pencils = nvar * n1 * n3; + + data_in = new double[n2 * total_pencils]; + data_out = new double[n2 * total_pencils]; + double *data_in2 = new double[n2 * total_pencils]; + double *data_out2 = new double[n2 * total_pencils]; + + // Gather v.d0 along B-direction AND v.d1 for mixed derivative + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int i = 0; i < n1; i++) { + int col = ivar + nvar * (i + n1 * k); + for (int j = 0; j < n2; j++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + data_in[j * total_pencils + col] = v.d0[indx]; + data_in2[j * total_pencils + col] = v.d1[indx]; // for d/dB of (dv/dA) + } + } + } + } + + // v.d2 = D1_B * v.d0 (along B) + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n2, total_pencils, n2, + 1.0, D1_B, n2, data_in, total_pencils, + 0.0, data_out, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int i = 0; i < n1; i++) { + int col = ivar + nvar * (i + n1 * k); + for (int j = 0; j < n2; j++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d2[indx] = data_out[j * total_pencils + col]; + } + } + } + } + + // v.d22 = 
D2_B * v.d0 + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n2, total_pencils, n2, + 1.0, D2_B, n2, data_in, total_pencils, + 0.0, data_out, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int i = 0; i < n1; i++) { + int col = ivar + nvar * (i + n1 * k); + for (int j = 0; j < n2; j++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d22[indx] = data_out[j * total_pencils + col]; + } + } + } + } + + // v.d12 = D1_B * v.d1 (mixed: d/dB of dv/dA) + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n2, total_pencils, n2, + 1.0, D1_B, n2, data_in2, total_pencils, + 0.0, data_out2, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int i = 0; i < n1; i++) { + int col = ivar + nvar * (i + n1 * k); + for (int j = 0; j < n2; j++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d12[indx] = data_out2[j * total_pencils + col]; + } + } + } + } + + delete[] data_in; + delete[] data_out; + delete[] data_in2; + delete[] data_out2; + + /*===================================================== + * STEP 3: phi-direction derivatives (Fourier, DF1_phi, DF2_phi) + * + * Pencils along k for each (ivar, i, j). 
+ * Also compute mixed derivatives v.d13, v.d23 + *=====================================================*/ + total_pencils = nvar * n1 * n2; + + data_in = new double[n3 * total_pencils]; + data_out = new double[n3 * total_pencils]; + data_in2 = new double[n3 * total_pencils]; // for v.d1 → v.d13 + data_out2 = new double[n3 * total_pencils]; + double *data_in3 = new double[n3 * total_pencils]; // for v.d2 → v.d23 + double *data_out3 = new double[n3 * total_pencils]; + + // Gather v.d0, v.d1, v.d2 along phi-direction + for (int ivar = 0; ivar < nvar; ivar++) { + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (i + n1 * j); + for (int k = 0; k < n3; k++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + data_in[k * total_pencils + col] = v.d0[indx]; + data_in2[k * total_pencils + col] = v.d1[indx]; + data_in3[k * total_pencils + col] = v.d2[indx]; + } + } + } + } + + // v.d3 = DF1_phi * v.d0 + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n3, total_pencils, n3, + 1.0, DF1_phi, n3, data_in, total_pencils, + 0.0, data_out, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (i + n1 * j); + for (int k = 0; k < n3; k++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d3[indx] = data_out[k * total_pencils + col]; + } + } + } + } + + // v.d33 = DF2_phi * v.d0 + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n3, total_pencils, n3, + 1.0, DF2_phi, n3, data_in, total_pencils, + 0.0, data_out, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (i + n1 * j); + for (int k = 0; k < n3; k++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d33[indx] = data_out[k * total_pencils + col]; + } + } + } + } + + // v.d13 = DF1_phi * v.d1 (mixed: d/dphi of dv/dA) + cblas_dgemm(CblasRowMajor, 
CblasNoTrans, CblasNoTrans, + n3, total_pencils, n3, + 1.0, DF1_phi, n3, data_in2, total_pencils, + 0.0, data_out2, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (i + n1 * j); + for (int k = 0; k < n3; k++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d13[indx] = data_out2[k * total_pencils + col]; + } + } + } + } + + // v.d23 = DF1_phi * v.d2 (mixed: d/dphi of dv/dB) + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n3, total_pencils, n3, + 1.0, DF1_phi, n3, data_in3, total_pencils, + 0.0, data_out3, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (i + n1 * j); + for (int k = 0; k < n3; k++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d23[indx] = data_out3[k * total_pencils + col]; + } + } + } + } + + delete[] data_in; + delete[] data_out; + delete[] data_in2; + delete[] data_out2; + delete[] data_in3; + delete[] data_out3; +} + diff --git a/AMSS_NCKU_source/TwoPunctures.h b/AMSS_NCKU_source/TwoPunctures.h index 22fb359..5f95797 100644 --- a/AMSS_NCKU_source/TwoPunctures.h +++ b/AMSS_NCKU_source/TwoPunctures.h @@ -1,7 +1,8 @@ - #ifndef TWO_PUNCTURES_H #define TWO_PUNCTURES_H +#include + #define StencilSize 19 #define N_PlaneRelax 1 #define NRELAX 200 @@ -32,7 +33,7 @@ private: int npoints_A, npoints_B, npoints_phi; double target_M_plus, target_M_minus; - + double admMass; double adm_tol; @@ -42,6 +43,18 @@ private: int ntotal; + // ===== Precomputed spectral derivative matrices ===== + double *D1_A, *D2_A; + double *D1_B, *D2_B; + double *DF1_phi, *DF2_phi; + + // ===== Pre-allocated workspace for LineRelax (per-thread) ===== + int max_threads; + double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be; + double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be; + double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, 
**ws_x_al; + double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al; + struct parameters { int nvar, n1, n2, n3; @@ -58,6 +71,28 @@ public: int Newtonmaxit); ~TwoPunctures(); + // 02/07: New/modified methods + void allocate_workspace(); + void free_workspace(); + void precompute_derivative_matrices(); + void build_cheb_deriv_matrices(int n, double *D1, double *D2); + void build_fourier_deriv_matrices(int N, double *DF1, double *DF2); + void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v); + void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q, + double *l, double *u_ws, double *d, double *y); + void LineRelax_be_omp(double *dv, + int const i, int const k, int const nvar, + int const n1, int const n2, int const n3, + double const *rhs, int const *ncols, int **cols, + double **JFD, int tid); + void LineRelax_al_omp(double *dv, + int const j, int const k, int const nvar, + int const n1, int const n2, int const n3, + double const *rhs, int const *ncols, + int **cols, double **JFD, int tid); + void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3, + double const *rhs, int const *ncols, int **cols, double **JFD); + void Solve(); void set_initial_guess(derivs v); int index(int i, int j, int k, int l, int a, int b, int c, int d); @@ -116,23 +151,11 @@ public: double BY_KKofxyz(double x, double y, double z); void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix); void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u); - void relax(double *dv, int const nvar, int const n1, int const n2, int const n3, - double const *rhs, int const *ncols, int **cols, double **JFD); - void LineRelax_be(double *dv, - int const i, int const k, int const nvar, - int const n1, int const n2, int const n3, - double const *rhs, int const *ncols, int **cols, - double **JFD); void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2, int n3, 
derivs dv, derivs u, double *values); void LinEquations(double A, double B, double X, double R, double x, double r, double phi, double y, double z, derivs dU, derivs U, double *values); - void LineRelax_al(double *dv, - int const j, int const k, int const nvar, - int const n1, int const n2, int const n3, - double const *rhs, int const *ncols, - int **cols, double **JFD); void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q); void Save(char *fname); // provided by Vasileios Paschalidis (vpaschal@illinois.edu) @@ -141,4 +164,4 @@ public: void SpecCoef(parameters par, int ivar, double *v, double *cf); }; -#endif /* TWO_PUNCTURES_H */ +#endif /* TWO_PUNCTURES_H */ \ No newline at end of file diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index fc6c88e..18f1388 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -730,6 +730,12 @@ void bssn_class::Initialize() PhysTime = StartTime; Setup_Black_Hole_position(); } + + // Initialize sync caches (per-level, for predictor and corrector) + sync_cache_pre = new Parallel::SyncCache[GH->levels]; + sync_cache_cor = new Parallel::SyncCache[GH->levels]; + sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; + sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; } //================================================================================================ @@ -981,6 +987,32 @@ bssn_class::~bssn_class() delete Azzz; #endif + // Destroy sync caches before GH + if (sync_cache_pre) + { + for (int i = 0; i < GH->levels; i++) + sync_cache_pre[i].destroy(); + delete[] sync_cache_pre; + } + if (sync_cache_cor) + { + for (int i = 0; i < GH->levels; i++) + sync_cache_cor[i].destroy(); + delete[] sync_cache_cor; + } + if (sync_cache_rp_coarse) + { + for (int i = 0; i < GH->levels; i++) + sync_cache_rp_coarse[i].destroy(); + delete[] sync_cache_rp_coarse; + } + if (sync_cache_rp_fine) + { + for (int i = 0; i < GH->levels; i++) + 
sync_cache_rp_fine[i].destroy(); + delete[] sync_cache_rp_fine; + } + delete GH; #ifdef WithShell delete SH; @@ -2181,6 +2213,7 @@ void bssn_class::Evolve(int Steps) GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } #endif #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2)) @@ -2396,6 +2429,7 @@ void bssn_class::RecursiveStep(int lev) GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } #endif } @@ -2574,6 +2608,7 @@ void bssn_class::ParallelStep() GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } #endif } @@ -2740,6 +2775,7 @@ void bssn_class::ParallelStep() GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor); + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } // a_stream.clear(); // a_stream.str(""); @@ -2754,6 +2790,7 @@ void bssn_class::ParallelStep() GH->Regrid_Onelevel(lev, Symmetry, BH_num, 
Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } // a_stream.clear(); // a_stream.str(""); @@ -2772,6 +2809,7 @@ void bssn_class::ParallelStep() GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor); + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } // a_stream.clear(); // a_stream.str(""); @@ -2787,6 +2825,7 @@ void bssn_class::ParallelStep() GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor); + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } // a_stream.clear(); // a_stream.str(""); @@ -3158,21 +3197,7 @@ void bssn_class::Step(int lev, int YN) } Pp = Pp->next; } - // check error information - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } + // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell // evolve Shell Patches @@ -3190,9 +3215,9 @@ void bssn_class::Step(int lev, int YN) { #if (AGM == 0) 
f_enforce_ga(cg->shape, - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], - cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], + cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); #endif @@ -3316,25 +3341,16 @@ void bssn_class::Step(int lev, int YN) #endif } - // check error information + // Non-blocking error reduction overlapped with Sync to hide Allreduce latency + MPI_Request err_req; { int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - - if (ERROR) - { - SH->Dump_Data(StateList, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); } #endif - Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); + Parallel::AsyncSyncState async_pre; + Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); #ifdef WithShell if (lev == 0) @@ -3347,12 +3363,29 @@ void bssn_class::Step(int lev, int YN) { prev_clock = curr_clock; curr_clock = clock(); - cout << " Shell stuff synchronization used " - << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) + cout << " Shell stuff synchronization used " + << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! 
" << endl; } } #endif + Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); + +#ifdef WithShell + // Complete non-blocking error reduction and check + MPI_Wait(&err_req, MPI_STATUS_IGNORE); + if (ERROR) + { + Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); + SH->Dump_Data(StateList, 0, PhysTime, dT_lev); + if (myrank == 0) + { + if (ErrorMonitor->outfile) + ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } +#endif #if (MAPBH == 0) // for black hole position @@ -3528,24 +3561,7 @@ void bssn_class::Step(int lev, int YN) Pp = Pp->next; } - // check error information - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - - if (ERROR) - { - Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count - << " variables at t = " << PhysTime - << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } + // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell // evolve Shell Patches @@ -3563,9 +3579,9 @@ void bssn_class::Step(int lev, int YN) { #if (AGM == 0) f_enforce_ga(cg->shape, - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); #elif (AGM == 1) if (iter_count == 3) @@ -3685,26 +3701,16 @@ void bssn_class::Step(int lev, int YN) sPp = sPp->next; } } - // check error information + // Non-blocking error reduction overlapped with Sync to hide Allreduce latency + 
MPI_Request err_req_cor; { int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" - << iter_count << " variables at t = " - << PhysTime << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); } #endif - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); + Parallel::AsyncSyncState async_cor; + Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); #ifdef WithShell if (lev == 0) @@ -3717,12 +3723,31 @@ void bssn_class::Step(int lev, int YN) { prev_clock = curr_clock; curr_clock = clock(); - cout << " Shell stuff synchronization used " - << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) + cout << " Shell stuff synchronization used " + << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! 
" << endl; } } #endif + Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); + +#ifdef WithShell + // Complete non-blocking error reduction and check + MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE); + if (ERROR) + { + Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); + SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); + if (myrank == 0) + { + if (ErrorMonitor->outfile) + ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count + << " variables at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } +#endif #if (MAPBH == 0) // for black hole position @@ -4034,22 +4059,7 @@ void bssn_class::Step(int lev, int YN) } Pp = Pp->next; } - // check error information - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime - << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } + // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell // evolve Shell Patches @@ -4067,15 +4077,15 @@ void bssn_class::Step(int lev, int YN) { #if (AGM == 0) f_enforce_ga(cg->shape, - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], - cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], + cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); #endif if (f_compute_rhs_bssn_ss(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], - cg->fgfs[fngfs + ShellPatch::gx], - cg->fgfs[fngfs + ShellPatch::gy], + cg->fgfs[fngfs + ShellPatch::gx], + cg->fgfs[fngfs + 
ShellPatch::gy], cg->fgfs[fngfs + ShellPatch::gz], cg->fgfs[fngfs + ShellPatch::drhodx], cg->fgfs[fngfs + ShellPatch::drhody], @@ -4190,25 +4200,16 @@ void bssn_class::Step(int lev, int YN) } #endif } - // check error information + // Non-blocking error reduction overlapped with Sync to hide Allreduce latency + MPI_Request err_req; { int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - SH->Dump_Data(StateList, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " - << PhysTime << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); } #endif - Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); + Parallel::AsyncSyncState async_pre; + Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); #ifdef WithShell if (lev == 0) @@ -4221,9 +4222,27 @@ void bssn_class::Step(int lev, int YN) { prev_clock = curr_clock; curr_clock = clock(); - cout << " Shell stuff synchronization used " - << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) - << " seconds! " << endl; + cout << " Shell stuff synchronization used " + << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) + << " seconds! 
" << endl; + } + } +#endif + Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); + +#ifdef WithShell + // Complete non-blocking error reduction and check + MPI_Wait(&err_req, MPI_STATUS_IGNORE); + if (ERROR) + { + Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); + SH->Dump_Data(StateList, 0, PhysTime, dT_lev); + if (myrank == 0) + { + if (ErrorMonitor->outfile) + ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); } } #endif @@ -4386,23 +4405,7 @@ void bssn_class::Step(int lev, int YN) Pp = Pp->next; } - // check error information - { - int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count - << " variables at t = " << PhysTime - << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } + // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell // evolve Shell Patches @@ -4420,9 +4423,9 @@ void bssn_class::Step(int lev, int YN) { #if (AGM == 0) f_enforce_ga(cg->shape, - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); #elif (AGM == 1) if (iter_count == 3) @@ -4542,25 +4545,16 @@ void bssn_class::Step(int lev, int YN) sPp = sPp->next; } } - // check error information + // Non-blocking error reduction overlapped with Sync to hide Allreduce latency + MPI_Request err_req_cor; { int erh = ERROR; - 
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count - << " variables at t = " << PhysTime << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); } #endif - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); + Parallel::AsyncSyncState async_cor; + Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); #ifdef WithShell if (lev == 0) @@ -4573,11 +4567,30 @@ void bssn_class::Step(int lev, int YN) { prev_clock = curr_clock; curr_clock = clock(); - cout << " Shell stuff synchronization used " - << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) + cout << " Shell stuff synchronization used " + << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! 
" << endl; } } +#endif + Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); + +#ifdef WithShell + // Complete non-blocking error reduction and check + MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE); + if (ERROR) + { + Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); + SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); + if (myrank == 0) + { + if (ErrorMonitor->outfile) + ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count + << " variables at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } #endif // for black hole position if (BH_num > 0 && lev == GH->levels - 1) @@ -4943,11 +4956,19 @@ void bssn_class::Step(int lev, int YN) // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation"); - // check error information + // Non-blocking error reduction overlapped with Sync to hide Allreduce latency + MPI_Request err_req; { int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]); + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req); } + + // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync"); + + Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); + + // Complete non-blocking error reduction and check + MPI_Wait(&err_req, MPI_STATUS_IGNORE); if (ERROR) { Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); @@ -4959,10 +4980,6 @@ void bssn_class::Step(int lev, int YN) } } - // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync"); - - Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); - #if (MAPBH == 0) // for black hole position if (BH_num > 0 && lev == GH->levels - 1) @@ -5140,30 +5157,34 @@ void bssn_class::Step(int lev, int YN) // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check"); - // check error information + // Non-blocking 
error reduction overlapped with Sync to hide Allreduce latency + MPI_Request err_req_cor; { int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]); + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req_cor); } + + // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync"); + + Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]); + + // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync"); + + // Complete non-blocking error reduction and check + MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE); if (ERROR) { Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); if (myrank == 0) { if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count - << " variables at t = " << PhysTime + ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count + << " variables at t = " << PhysTime << ", lev = " << lev << endl; MPI_Abort(MPI_COMM_WORLD, 1); } } - // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync"); - - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); - - // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync"); - #if (MAPBH == 0) // for black hole position if (BH_num > 0 && lev == GH->levels - 1) @@ -5447,21 +5468,11 @@ void bssn_class::SHStep() #if (PSTR == 1 || PSTR == 2) // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check"); #endif - // check error information + // Non-blocking error reduction overlapped with Synch to hide Allreduce latency + MPI_Request err_req; { int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - - if (ERROR) - { - SH->Dump_Data(StateList, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl; - 
MPI_Abort(MPI_COMM_WORLD, 1); - } + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); } { @@ -5473,12 +5484,25 @@ void bssn_class::SHStep() { prev_clock = curr_clock; curr_clock = clock(); - cout << " Shell stuff synchronization used " - << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) + cout << " Shell stuff synchronization used " + << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! " << endl; } } + // Complete non-blocking error reduction and check + MPI_Wait(&err_req, MPI_STATUS_IGNORE); + if (ERROR) + { + SH->Dump_Data(StateList, 0, PhysTime, dT_lev); + if (myrank == 0) + { + if (ErrorMonitor->outfile) + ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + // corrector for (iter_count = 1; iter_count < 4; iter_count++) { @@ -5621,21 +5645,11 @@ void bssn_class::SHStep() sPp = sPp->next; } } - // check error information + // Non-blocking error reduction overlapped with Synch to hide Allreduce latency + MPI_Request err_req_cor; { int erh = ERROR; - MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - } - if (ERROR) - { - SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); - if (myrank == 0) - { - if (ErrorMonitor->outfile) - ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count - << " variables at t = " << PhysTime << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); } { @@ -5647,12 +5661,26 @@ void bssn_class::SHStep() { prev_clock = curr_clock; curr_clock = clock(); - cout << " Shell stuff synchronization used " - << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) + cout << " Shell stuff synchronization used " + << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! 
" << endl; } } + // Complete non-blocking error reduction and check + MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE); + if (ERROR) + { + SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev); + if (myrank == 0) + { + if (ErrorMonitor->outfile) + ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count + << " variables at t = " << PhysTime << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + sPp = SH->PatL; while (sPp) { @@ -5781,7 +5809,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); #endif - Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); + Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); #if (PSTR == 1 || PSTR == 2) // a_stream.clear(); @@ -5791,21 +5819,11 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) - Ppc = GH->PatL[lev - 1]; - while (Ppc) - { - Pp = GH->PatL[lev]; - while (Pp) - { #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry); + Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); #elif (MIXOUTB == 1) - Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry); + Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); #endif - Pp = Pp->next; - } - Ppc = Ppc->next; - } #elif (RPB == 1) // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry); @@ -5842,7 +5860,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); #endif - Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry); + Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); #if (PSTR == 1 || PSTR == 2) // 
a_stream.clear(); @@ -5852,21 +5870,11 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) - Ppc = GH->PatL[lev - 1]; - while (Ppc) - { - Pp = GH->PatL[lev]; - while (Pp) - { #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry); + Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #elif (MIXOUTB == 1) - Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry); + Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #endif - Pp = Pp->next; - } - Ppc = Ppc->next; - } #elif (RPB == 1) // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry); @@ -5880,7 +5888,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif } - Parallel::Sync(GH->PatL[lev], SL, Symmetry); + Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); #if (PSTR == 1 || PSTR == 2) // a_stream.clear(); @@ -5938,24 +5946,14 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); #endif - Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); + Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); #if (RPB == 0) - Ppc = GH->PatL[lev - 1]; - while (Ppc) - { - Pp = GH->PatL[lev]; - while (Pp) - { #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry); + Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); #elif (MIXOUTB == 1) - Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry); + Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); #endif - Pp = Pp->next; - } - Ppc = Ppc->next; - } #elif (RPB == 1) // 
Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry); @@ -5970,31 +5968,21 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); #endif - Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry); + Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); #if (RPB == 0) - Ppc = GH->PatL[lev - 1]; - while (Ppc) - { - Pp = GH->PatL[lev]; - while (Pp) - { #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry); + Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #elif (MIXOUTB == 1) - Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry); + Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #endif - Pp = Pp->next; - } - Ppc = Ppc->next; - } #elif (RPB == 1) // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry); #endif } - Parallel::Sync(GH->PatL[lev], SL, Symmetry); + Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); } } @@ -6045,24 +6033,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry); #endif - Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); + Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); #if (RPB == 0) - Ppc = GH->PatL[lev - 1]; - while (Ppc) - { - Pp = GH->PatL[lev]; - while (Pp) - { #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry); + Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); #elif (MIXOUTB == 
1) - Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry); + Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); #endif - Pp = Pp->next; - } - Ppc = Ppc->next; - } #elif (RPB == 1) // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry); @@ -6079,31 +6057,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry); #endif - Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry); + Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); #if (RPB == 0) - Ppc = GH->PatL[lev - 1]; - while (Ppc) - { - Pp = GH->PatL[lev]; - while (Pp) - { #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry); + Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); #elif (MIXOUTB == 1) - Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry); + Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); #endif - Pp = Pp->next; - } - Ppc = Ppc->next; - } #elif (RPB == 1) // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry); #endif } - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); + Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); } } @@ -6133,21 +6101,11 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB) } #if (RPB == 0) - Ppc = GH->PatL[lev - 1]; - while (Ppc) - { - Pp = GH->PatL[lev]; - while (Pp) - { #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(Ppc->data, 
Pp->data, SynchList_pre, SynchList_cor, Symmetry); + Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); #elif (MIXOUTB == 1) - Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry); + Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); #endif - Pp = Pp->next; - } - Ppc = Ppc->next; - } #elif (RPB == 1) // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry); @@ -6156,21 +6114,11 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB) else // no time refinement levels and for all same time levels { #if (RPB == 0) - Ppc = GH->PatL[lev - 1]; - while (Ppc) - { - Pp = GH->PatL[lev]; - while (Pp) - { #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry); + Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); #elif (MIXOUTB == 1) - Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry); + Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); #endif - Pp = Pp->next; - } - Ppc = Ppc->next; - } #elif (RPB == 1) // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry); @@ -6186,10 +6134,10 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB) #else Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); #endif - Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry); + Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); } - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); + 
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); } } #undef MIXOUTB diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h index 740d3aa..db434e2 100644 --- a/AMSS_NCKU_source/bssn_class.h +++ b/AMSS_NCKU_source/bssn_class.h @@ -126,6 +126,11 @@ public: MyList *OldStateList, *DumpList; MyList *ConstraintList; + Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync + Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync + Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1] + Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev] + monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ConVMonitor; surface_integral *Waveshell; diff --git a/AMSS_NCKU_source/bssn_rhs.f90 b/AMSS_NCKU_source/bssn_rhs.f90 index 80908cb..246b219 100644 --- a/AMSS_NCKU_source/bssn_rhs.f90 +++ b/AMSS_NCKU_source/bssn_rhs.f90 @@ -106,7 +106,8 @@ call getpbh(BHN,Porg,Mass) #endif -!!! sanity check +!!! 
sanity check (disabled in production builds for performance) +#ifdef DEBUG dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) & +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) & +sum(Gamx)+sum(Gamy)+sum(Gamz) & @@ -136,6 +137,7 @@ gont = 1 return endif +#endif PI = dacos(-ONE) diff --git a/AMSS_NCKU_source/enforce_algebra.f90 b/AMSS_NCKU_source/enforce_algebra.f90 index 71f3da2..2a511a5 100644 --- a/AMSS_NCKU_source/enforce_algebra.f90 +++ b/AMSS_NCKU_source/enforce_algebra.f90 @@ -18,49 +18,61 @@ real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz !~~~~~~~> Local variable: - - real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg - real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz - real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz + + integer :: i,j,k + real*8 :: lgxx,lgyy,lgzz,ldetg + real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz + real*8 :: ltrA,lscale real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 !~~~~~~> - gxx = dxx + ONE - gyy = dyy + ONE - gzz = dzz + ONE + do k=1,ex(3) + do j=1,ex(2) + do i=1,ex(1) - detg = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - & - gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz - gupxx = ( gyy * gzz - gyz * gyz ) / detg - gupxy = - ( gxy * gzz - gyz * gxz ) / detg - gupxz = ( gxy * gyz - gyy * gxz ) / detg - gupyy = ( gxx * gzz - gxz * gxz ) / detg - gupyz = - ( gxx * gyz - gxy * gxz ) / detg - gupzz = ( gxx * gyy - gxy * gxy ) / detg + lgxx = dxx(i,j,k) + ONE + lgyy = dyy(i,j,k) + ONE + lgzz = dzz(i,j,k) + ONE - trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz & - + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz) + ldetg = lgxx * lgyy * lgzz & + + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) & + + gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) & + - gxz(i,j,k) * lgyy * gxz(i,j,k) & + - gxy(i,j,k) * gxy(i,j,k) * lgzz & + - lgxx * gyz(i,j,k) * gyz(i,j,k) - Axx = Axx - F1o3 * gxx * trA - Axy = Axy - F1o3 * gxy * trA - Axz = Axz - F1o3 * gxz * trA - 
Ayy = Ayy - F1o3 * gyy * trA - Ayz = Ayz - F1o3 * gyz * trA - Azz = Azz - F1o3 * gzz * trA + lgupxx = ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg + lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg + lgupxz = ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg + lgupyy = ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg + lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg + lgupzz = ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg - detg = ONE / ( detg ** F1o3 ) - - gxx = gxx * detg - gxy = gxy * detg - gxz = gxz * detg - gyy = gyy * detg - gyz = gyz * detg - gzz = gzz * detg + ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) & + + lgupzz * Azz(i,j,k) & + + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) & + + lgupyz * Ayz(i,j,k)) - dxx = gxx - ONE - dyy = gyy - ONE - dzz = gzz - ONE + Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA + Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA + Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA + Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA + Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA + Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA + + lscale = ONE / ( ldetg ** F1o3 ) + + dxx(i,j,k) = lgxx * lscale - ONE + gxy(i,j,k) = gxy(i,j,k) * lscale + gxz(i,j,k) = gxz(i,j,k) * lscale + dyy(i,j,k) = lgyy * lscale - ONE + gyz(i,j,k) = gyz(i,j,k) * lscale + dzz(i,j,k) = lgzz * lscale - ONE + + enddo + enddo + enddo return @@ -82,51 +94,71 @@ real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz !~~~~~~~> Local variable: - - real*8, dimension(ex(1),ex(2),ex(3)) :: trA - real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz - real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz + + integer :: i,j,k + real*8 :: lgxx,lgyy,lgzz,lscale + real*8 :: lgxy,lgxz,lgyz + real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz + real*8 :: ltrA real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 !~~~~~~> - gxx = dxx + ONE - gyy = dyy + ONE - gzz = dzz + ONE -! 
for g - gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - & - gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz + do k=1,ex(3) + do j=1,ex(2) + do i=1,ex(1) - gupzz = ONE / ( gupzz ** F1o3 ) - - gxx = gxx * gupzz - gxy = gxy * gupzz - gxz = gxz * gupzz - gyy = gyy * gupzz - gyz = gyz * gupzz - gzz = gzz * gupzz +! for g: normalize determinant first + lgxx = dxx(i,j,k) + ONE + lgyy = dyy(i,j,k) + ONE + lgzz = dzz(i,j,k) + ONE + lgxy = gxy(i,j,k) + lgxz = gxz(i,j,k) + lgyz = gyz(i,j,k) - dxx = gxx - ONE - dyy = gyy - ONE - dzz = gzz - ONE -! for A + lscale = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz & + + lgxz * lgxy * lgyz - lgxz * lgyy * lgxz & + - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz - gupxx = ( gyy * gzz - gyz * gyz ) - gupxy = - ( gxy * gzz - gyz * gxz ) - gupxz = ( gxy * gyz - gyy * gxz ) - gupyy = ( gxx * gzz - gxz * gxz ) - gupyz = - ( gxx * gyz - gxy * gxz ) - gupzz = ( gxx * gyy - gxy * gxy ) + lscale = ONE / ( lscale ** F1o3 ) - trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz & - + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz) + lgxx = lgxx * lscale + lgxy = lgxy * lscale + lgxz = lgxz * lscale + lgyy = lgyy * lscale + lgyz = lgyz * lscale + lgzz = lgzz * lscale - Axx = Axx - F1o3 * gxx * trA - Axy = Axy - F1o3 * gxy * trA - Axz = Axz - F1o3 * gxz * trA - Ayy = Ayy - F1o3 * gyy * trA - Ayz = Ayz - F1o3 * gyz * trA - Azz = Azz - F1o3 * gzz * trA + dxx(i,j,k) = lgxx - ONE + gxy(i,j,k) = lgxy + gxz(i,j,k) = lgxz + dyy(i,j,k) = lgyy - ONE + gyz(i,j,k) = lgyz + dzz(i,j,k) = lgzz - ONE + +! 
for A: trace-free using normalized metric (det=1, no division needed) + lgupxx = ( lgyy * lgzz - lgyz * lgyz ) + lgupxy = - ( lgxy * lgzz - lgyz * lgxz ) + lgupxz = ( lgxy * lgyz - lgyy * lgxz ) + lgupyy = ( lgxx * lgzz - lgxz * lgxz ) + lgupyz = - ( lgxx * lgyz - lgxy * lgxz ) + lgupzz = ( lgxx * lgyy - lgxy * lgxy ) + + ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) & + + lgupzz * Azz(i,j,k) & + + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) & + + lgupyz * Ayz(i,j,k)) + + Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA + Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA + Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA + Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA + Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA + Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA + + enddo + enddo + enddo return diff --git a/AMSS_NCKU_source/fmisc.f90 b/AMSS_NCKU_source/fmisc.f90 index 81c5a62..1b57677 100644 --- a/AMSS_NCKU_source/fmisc.f90 +++ b/AMSS_NCKU_source/fmisc.f90 @@ -324,7 +324,6 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA) integer::i - funcc = 0.d0 funcc(1:extc(1),1:extc(2),1:extc(3)) = func do i=0,ord-1 funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) @@ -350,7 +349,6 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA) integer::i - funcc = 0.d0 funcc(1:extc(1),1:extc(2),1:extc(3)) = func do i=0,ord-1 funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) @@ -379,7 +377,6 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA) integer::i - funcc = 0.d0 funcc(1:extc(1),1:extc(2),1:extc(3)) = func do i=0,ord-1 funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) @@ -886,7 +883,6 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA) integer::i - funcc = 0.d0 funcc(1:extc(1),1:extc(2),1:extc(3)) = func do i=0,ord-1 funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) @@ -912,7 +908,6 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA) integer::i - funcc = 0.d0 funcc(1:extc(1),1:extc(2),1:extc(3)) = func do 
i=0,ord-1 funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) @@ -941,7 +936,6 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA) integer::i - funcc = 0.d0 funcc(1:extc(1),1:extc(2),1:extc(3)) = func do i=0,ord-1 funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) @@ -1118,64 +1112,65 @@ end subroutine d2dump ! Lagrangian polynomial interpolation !------------------------------------------------------------------------------ - subroutine polint(xa,ya,x,y,dy,ordn) - + subroutine polint(xa, ya, x, y, dy, ordn) implicit none -!~~~~~~> Input Parameter: - integer,intent(in) :: ordn - real*8, dimension(ordn), intent(in) :: xa,ya + integer, intent(in) :: ordn + real*8, dimension(ordn), intent(in) :: xa, ya real*8, intent(in) :: x - real*8, intent(out) :: y,dy + real*8, intent(out) :: y, dy -!~~~~~~> Other parameter: + integer :: i, m, ns, n_m + real*8, dimension(ordn) :: c, d, ho + real*8 :: dif, dift, hp, h, den_val - integer :: m,n,ns - real*8, dimension(ordn) :: c,d,den,ho - real*8 :: dif,dift + c = ya + d = ya + ho = xa - x -!~~~~~~> + ns = 1 + dif = abs(x - xa(1)) - n=ordn - m=ordn - - c=ya - d=ya - ho=xa-x - - ns=1 - dif=abs(x-xa(1)) - do m=1,n - dift=abs(x-xa(m)) - if(dift < dif) then - ns=m - dif=dift - end if + do i = 2, ordn + dift = abs(x - xa(i)) + if (dift < dif) then + ns = i + dif = dift + end if end do - y=ya(ns) - ns=ns-1 - do m=1,n-1 - den(1:n-m)=ho(1:n-m)-ho(1+m:n) - if (any(den(1:n-m) == 0.0))then - write(*,*) 'failure in polint for point',x - write(*,*) 'with input points: ',xa - stop - endif - den(1:n-m)=(c(2:n-m+1)-d(1:n-m))/den(1:n-m) - d(1:n-m)=ho(1+m:n)*den(1:n-m) - c(1:n-m)=ho(1:n-m)*den(1:n-m) - if (2*ns < n-m) then - dy=c(ns+1) + y = ya(ns) + ns = ns - 1 + + do m = 1, ordn - 1 + n_m = ordn - m + do i = 1, n_m + hp = ho(i) + h = ho(i+m) + den_val = hp - h + + if (den_val == 0.0d0) then + write(*,*) 'failure in polint for point',x + write(*,*) 'with input points: ',xa + stop + end if + + den_val = (c(i+1) - 
d(i)) / den_val + + d(i) = h * den_val + c(i) = hp * den_val + end do + + if (2 * ns < n_m) then + dy = c(ns + 1) else - dy=d(ns) - ns=ns-1 + dy = d(ns) + ns = ns - 1 end if - y=y+dy + y = y + dy end do return - end subroutine polint !------------------------------------------------------------------------------ ! @@ -1183,35 +1178,37 @@ end subroutine d2dump ! !------------------------------------------------------------------------------ subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn) - implicit none -!~~~~~~> Input parameters: integer,intent(in) :: ordn real*8, dimension(1:ordn), intent(in) :: x1a,x2a real*8, dimension(1:ordn,1:ordn), intent(in) :: ya real*8, intent(in) :: x1,x2 real*8, intent(out) :: y,dy -!~~~~~~> Other parameters: - +#ifdef POLINT_LEGACY_ORDER integer :: i,m real*8, dimension(ordn) :: ymtmp real*8, dimension(ordn) :: yntmp m=size(x1a) - do i=1,m - yntmp=ya(i,:) call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) - end do - call polint(x1a,ymtmp,x1,y,dy,ordn) +#else + integer :: j + real*8, dimension(ordn) :: ymtmp + real*8 :: dy_temp + + do j=1,ordn + call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn) + end do + call polint(x2a, ymtmp, x2, y, dy, ordn) +#endif return - end subroutine polin2 !------------------------------------------------------------------------------ ! @@ -1219,18 +1216,15 @@ end subroutine d2dump ! 
!------------------------------------------------------------------------------ subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn) - implicit none -!~~~~~~> Input parameters: integer,intent(in) :: ordn real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya real*8, intent(in) :: x1,x2,x3 real*8, intent(out) :: y,dy -!~~~~~~> Other parameters: - +#ifdef POLINT_LEGACY_ORDER integer :: i,j,m,n real*8, dimension(ordn,ordn) :: yatmp real*8, dimension(ordn) :: ymtmp @@ -1239,27 +1233,36 @@ end subroutine d2dump m=size(x1a) n=size(x2a) - do i=1,m do j=1,n - yqtmp=ya(i,j,:) call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn) - end do - yntmp=yatmp(i,:) call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) - end do - call polint(x1a,ymtmp,x1,y,dy,ordn) +#else + integer :: j, k + real*8, dimension(ordn,ordn) :: yatmp + real*8, dimension(ordn) :: ymtmp + real*8 :: dy_temp + + do k=1,ordn + do j=1,ordn + call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn) + end do + end do + do k=1,ordn + call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn) + end do + call polint(x3a, ymtmp, x3, y, dy, ordn) +#endif return - end subroutine polin3 !-------------------------------------------------------------------------------------- -! calculate L2norm +! calculate L2norm subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,& f,f_out,gw) @@ -1276,7 +1279,9 @@ end subroutine d2dump real*8 :: dX, dY, dZ integer::imin,jmin,kmin integer::imax,jmax,kmax - integer::i,j,k + integer::i,j,k,n_elements + real*8, dimension(:), allocatable :: f_flat + real*8, external :: DDOT dX = X(2) - X(1) dY = Y(2) - Y(1) @@ -1300,7 +1305,12 @@ if(dabs(X(1)-xmin) < dX) imin = 1 if(dabs(Y(1)-ymin) < dY) jmin = 1 if(dabs(Z(1)-zmin) < dZ) kmin = 1 -f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax)) +! 
Optimized with oneMKL BLAS DDOT for dot product +n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) +allocate(f_flat(n_elements)) +f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements]) +f_out = DDOT(n_elements, f_flat, 1, f_flat, 1) +deallocate(f_flat) f_out = f_out*dX*dY*dZ @@ -1325,7 +1335,9 @@ f_out = f_out*dX*dY*dZ real*8 :: dX, dY, dZ integer::imin,jmin,kmin integer::imax,jmax,kmax - integer::i,j,k + integer::i,j,k,n_elements + real*8, dimension(:), allocatable :: f_flat + real*8, external :: DDOT real*8 :: PIo4 @@ -1388,7 +1400,12 @@ if(Symmetry==2)then if(dabs(ymin+gw*dY) #include #endif -/* Linear equation solution by Gauss-Jordan elimination. + +// Intel oneMKL LAPACK interface +#include +/* Linear equation solution using Intel oneMKL LAPACK. a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input containing the right-hand side vectors. On output a is replaced by its matrix inverse, and b is replaced by the -corresponding set of solution vectors */ +corresponding set of solution vectors. + +Mathematical equivalence: + Solves: A * x = b => x = A^(-1) * b + Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results + within numerical precision. 
*/ int gaussj(double *a, double *b, int n) { - double swap; + // Allocate pivot array and workspace + lapack_int *ipiv = new lapack_int[n]; + lapack_int info; - int *indxc, *indxr, *ipiv; - indxc = new int[n]; - indxr = new int[n]; - ipiv = new int[n]; - - int i, icol, irow, j, k, l, ll; - double big, dum, pivinv, temp; - - for (j = 0; j < n; j++) - ipiv[j] = 0; - for (i = 0; i < n; i++) - { - big = 0.0; - for (j = 0; j < n; j++) - if (ipiv[j] != 1) - for (k = 0; k < n; k++) - { - if (ipiv[k] == 0) - { - if (fabs(a[j * n + k]) >= big) - { - big = fabs(a[j * n + k]); - irow = j; - icol = k; - } - } - else if (ipiv[k] > 1) - { - cout << "gaussj: Singular Matrix-1" << endl; - for (int ii = 0; ii < n; ii++) - { - for (int jj = 0; jj < n; jj++) - cout << a[ii * n + jj] << " "; - cout << endl; - } - return 1; // error return - } - } - - ipiv[icol] = ipiv[icol] + 1; - if (irow != icol) - { - for (l = 0; l < n; l++) - { - swap = a[irow * n + l]; - a[irow * n + l] = a[icol * n + l]; - a[icol * n + l] = swap; - } - - swap = b[irow]; - b[irow] = b[icol]; - b[icol] = swap; - } - - indxr[i] = irow; - indxc[i] = icol; - - if (a[icol * n + icol] == 0.0) - { - cout << "gaussj: Singular Matrix-2" << endl; - for (int ii = 0; ii < n; ii++) - { - for (int jj = 0; jj < n; jj++) - cout << a[ii * n + jj] << " "; - cout << endl; - } - return 1; // error return - } - - pivinv = 1.0 / a[icol * n + icol]; - a[icol * n + icol] = 1.0; - for (l = 0; l < n; l++) - a[icol * n + l] *= pivinv; - b[icol] *= pivinv; - for (ll = 0; ll < n; ll++) - if (ll != icol) - { - dum = a[ll * n + icol]; - a[ll * n + icol] = 0.0; - for (l = 0; l < n; l++) - a[ll * n + l] -= a[icol * n + l] * dum; - b[ll] -= b[icol] * dum; - } + // Make a copy of matrix a for solving (dgesv modifies it to LU form) + double *a_copy = new double[n * n]; + for (int i = 0; i < n * n; i++) { + a_copy[i] = a[i]; } - for (l = n - 1; l >= 0; l--) - { - if (indxr[l] != indxc[l]) - for (k = 0; k < n; k++) - { - swap = a[k * n + indxr[l]]; - 
a[k * n + indxr[l]] = a[k * n + indxc[l]]; - a[k * n + indxc[l]] = swap; - } + // Step 1: Solve linear system A*x = b using LU decomposition + // LAPACKE_dgesv uses column-major by default, but we use row-major + info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1); + + if (info != 0) { + cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl; + delete[] ipiv; + delete[] a_copy; + return 1; + } + + // Step 2: Compute matrix inverse A^(-1) using LU factorization + // First do LU factorization of original matrix a + info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv); + + if (info != 0) { + cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl; + delete[] ipiv; + delete[] a_copy; + return 1; + } + + // Then compute inverse from LU factorization + info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv); + + if (info != 0) { + cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl; + delete[] ipiv; + delete[] a_copy; + return 1; } - delete[] indxc; - delete[] indxr; delete[] ipiv; + delete[] a_copy; return 0; } diff --git a/AMSS_NCKU_source/ilucg.f90 b/AMSS_NCKU_source/ilucg.f90 index 90c36f5..3443353 100644 --- a/AMSS_NCKU_source/ilucg.f90 +++ b/AMSS_NCKU_source/ilucg.f90 @@ -512,11 +512,10 @@ IMPLICIT DOUBLE PRECISION (A-H,O-Z) DIMENSION V(N),W(N) ! SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT. +! Optimized using Intel oneMKL BLAS ddot +! 
Mathematical equivalence: DGVV = sum_{i=1}^{N} V(i)*W(i) - SUM = 0.0D0 - DO 10 I = 1,N - SUM = SUM + V(I)*W(I) -10 CONTINUE - DGVV = SUM + DOUBLE PRECISION, EXTERNAL :: DDOT + DGVV = DDOT(N, V, 1, W, 1) RETURN END diff --git a/AMSS_NCKU_source/macrodef.h b/AMSS_NCKU_source/macrodef.h index ca67877..164785a 100644 --- a/AMSS_NCKU_source/macrodef.h +++ b/AMSS_NCKU_source/macrodef.h @@ -2,7 +2,7 @@ #ifndef MICRODEF_H #define MICRODEF_H -#include "microdef.fh" +#include "macrodef.fh" // application parameters diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 0e2a08d..f2d4e3c 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -16,6 +16,12 @@ include makefile.inc .cu.o: $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) +TwoPunctures.o: TwoPunctures.C + ${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@ + +TwoPunctureABE.o: TwoPunctureABE.C + ${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@ + # Input files C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\ @@ -96,7 +102,7 @@ ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) TwoPunctureABE: $(TwoPunctureFILES) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS) + $(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) clean: rm *.o ABE ABEGPU TwoPunctureABE make.log -f diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 00b4bc3..25d6cd0 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -1,19 +1,30 @@ - +## GCC version (commented out) ## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ +## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ 
-I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ +## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran -filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ +## Intel oneAPI version with oneMKL (Optimized for performance) +filein = -I/usr/include/ -I${MKLROOT}/include -## LDLIBS = -L/usr/lib/x86_64-linux-gnu -lmpich -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran +## Using sequential MKL (OpenMP disabled for better single-threaded performance) +## Added -lifcore for Intel Fortran runtime and -limf for Intel math library +LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -CXXAPPFLAGS = -O0 -Wno-deprecated -Dfortran3 -Dnewc -#f90appflags = -O0 -fpp -f90appflags = -O0 -x f95-cpp-input -f90 = gfortran -f77 = gfortran -CXX = g++ -CC = gcc -CLINKER = mpic++ +## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization) +## -fprofile-instr-use: use collected profile data to guide optimization decisions +## (branch prediction, basic block layout, inlining, loop unrolling) +PROFDATA = /home/amss/AMSS-NCKU/pgo_profile/default.profdata +CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ + -fprofile-instr-use=$(PROFDATA) \ + -Dfortran3 -Dnewc -I${MKLROOT}/include +f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ + -fprofile-instr-use=$(PROFDATA) \ + -align array64byte -fpp -I${MKLROOT}/include +f90 = ifx +f77 = ifx +CXX = icpx +CC = icx +CLINKER = mpiicpx Cu = nvcc CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include diff --git 
a/AMSS_NCKU_source/surface_integral.C b/AMSS_NCKU_source/surface_integral.C index 410aee2..c2b7b67 100644 --- a/AMSS_NCKU_source/surface_integral.C +++ b/AMSS_NCKU_source/surface_integral.C @@ -220,16 +220,9 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var * pox[2][n] = rex * nz_g[n]; } - double *shellf; - shellf = new double[n_tot * InList]; - - GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry); - int mp, Lp, Nmin, Nmax; - mp = n_tot / cpusize; Lp = n_tot - cpusize * mp; - if (Lp > myrank) { Nmin = myrank * mp + myrank; @@ -241,6 +234,11 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var * Nmax = Nmin + mp - 1; } + double *shellf; + shellf = new double[n_tot * InList]; + + GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax); + //|~~~~~> Integrate the dot product of Dphi with the surface normal. double *RP_out, *IP_out; @@ -363,8 +361,17 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var * } //|------+ Communicate and sum the results from each processor. - MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + { + double *RPIP_out = new double[2 * NN]; + double *RPIP = new double[2 * NN]; + memcpy(RPIP_out, RP_out, NN * sizeof(double)); + memcpy(RPIP_out + NN, IP_out, NN * sizeof(double)); + MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + memcpy(RP, RPIP, NN * sizeof(double)); + memcpy(IP, RPIP + NN, NN * sizeof(double)); + delete[] RPIP_out; + delete[] RPIP; + } //|------= Free memory. @@ -556,8 +563,17 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var * } //|------+ Communicate and sum the results from each processor. 
- MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, Comm_here); - MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, Comm_here); + { + double *RPIP_out = new double[2 * NN]; + double *RPIP = new double[2 * NN]; + memcpy(RPIP_out, RP_out, NN * sizeof(double)); + memcpy(RPIP_out + NN, IP_out, NN * sizeof(double)); + MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, Comm_here); + memcpy(RP, RPIP, NN * sizeof(double)); + memcpy(IP, RPIP + NN, NN * sizeof(double)); + delete[] RPIP_out; + delete[] RPIP; + } //|------= Free memory. @@ -735,8 +751,17 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4 } //|------+ Communicate and sum the results from each processor. - MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + { + double *RPIP_out = new double[2 * NN]; + double *RPIP = new double[2 * NN]; + memcpy(RPIP_out, RP_out, NN * sizeof(double)); + memcpy(RPIP_out + NN, IP_out, NN * sizeof(double)); + MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + memcpy(RP, RPIP, NN * sizeof(double)); + memcpy(IP, RPIP + NN, NN * sizeof(double)); + delete[] RPIP_out; + delete[] RPIP; + } //|------= Free memory. @@ -984,8 +1009,17 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, } //|------+ Communicate and sum the results from each processor. - MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + { + double *RPIP_out = new double[2 * NN]; + double *RPIP = new double[2 * NN]; + memcpy(RPIP_out, RP_out, NN * sizeof(double)); + memcpy(RPIP_out + NN, IP_out, NN * sizeof(double)); + MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + memcpy(RP, RPIP, NN * sizeof(double)); + memcpy(IP, RPIP + NN, NN * sizeof(double)); + delete[] RPIP_out; + delete[] RPIP; + } //|------= Free memory. 
@@ -1419,8 +1453,17 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, } //|------+ Communicate and sum the results from each processor. - MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + { + double *RPIP_out = new double[2 * NN]; + double *RPIP = new double[2 * NN]; + memcpy(RPIP_out, RP_out, NN * sizeof(double)); + memcpy(RPIP_out + NN, IP_out, NN * sizeof(double)); + MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + memcpy(RP, RPIP, NN * sizeof(double)); + memcpy(IP, RPIP + NN, NN * sizeof(double)); + delete[] RPIP_out; + delete[] RPIP; + } //|------= Free memory. @@ -1854,8 +1897,17 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, } //|------+ Communicate and sum the results from each processor. - MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + { + double *RPIP_out = new double[2 * NN]; + double *RPIP = new double[2 * NN]; + memcpy(RPIP_out, RP_out, NN * sizeof(double)); + memcpy(RPIP_out + NN, IP_out, NN * sizeof(double)); + MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + memcpy(RP, RPIP, NN * sizeof(double)); + memcpy(IP, RPIP + NN, NN * sizeof(double)); + delete[] RPIP_out; + delete[] RPIP; + } //|------= Free memory. @@ -2040,8 +2092,17 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch2 *GH, var * } //|------+ Communicate and sum the results from each processor. 
- MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + { + double *RPIP_out = new double[2 * NN]; + double *RPIP = new double[2 * NN]; + memcpy(RPIP_out, RP_out, NN * sizeof(double)); + memcpy(RPIP_out + NN, IP_out, NN * sizeof(double)); + MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + memcpy(RP, RPIP, NN * sizeof(double)); + memcpy(IP, RPIP + NN, NN * sizeof(double)); + delete[] RPIP_out; + delete[] RPIP; + } //|------= Free memory. @@ -2226,8 +2287,17 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch *GH, var *R } //|------+ Communicate and sum the results from each processor. - MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + { + double *RPIP_out = new double[2 * NN]; + double *RPIP = new double[2 * NN]; + memcpy(RPIP_out, RP_out, NN * sizeof(double)); + memcpy(RPIP_out + NN, IP_out, NN * sizeof(double)); + MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + memcpy(RP, RPIP, NN * sizeof(double)); + memcpy(IP, RPIP + NN, NN * sizeof(double)); + delete[] RPIP_out; + delete[] RPIP; + } //|------= Free memory. 
@@ -2314,25 +2384,9 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var pox[2][n] = rex * nz_g[n]; } - double *shellf; - shellf = new double[n_tot * InList]; - - // we have assumed there is only one box on this level, - // so we do not need loop boxes - GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry); - - double Mass_out = 0; - double ang_outx, ang_outy, ang_outz; - double p_outx, p_outy, p_outz; - ang_outx = ang_outy = ang_outz = 0.0; - p_outx = p_outy = p_outz = 0.0; - const double f1o8 = 0.125; - int mp, Lp, Nmin, Nmax; - mp = n_tot / cpusize; Lp = n_tot - cpusize * mp; - if (Lp > myrank) { Nmin = myrank * mp + myrank; @@ -2344,6 +2398,20 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var Nmax = Nmin + mp - 1; } + double *shellf; + shellf = new double[n_tot * InList]; + + // we have assumed there is only one box on this level, + // so we do not need loop boxes + GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax); + + double Mass_out = 0; + double ang_outx, ang_outy, ang_outz; + double p_outx, p_outy, p_outz; + ang_outx = ang_outy = ang_outz = 0.0; + p_outx = p_outy = p_outz = 0.0; + const double f1o8 = 0.125; + double Chi, Psi; double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz; double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz; @@ -2464,15 +2532,13 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var } } - MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - - MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - - MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + { + double 
scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz}; + double scalar_in[7]; + MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3]; + px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6]; + } #ifdef GaussInt mass = mass * rex * rex * dphi * factor; @@ -2735,15 +2801,13 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var } } - MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, Comm_here); - - MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, Comm_here); - MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, Comm_here); - MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, Comm_here); - - MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, Comm_here); - MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, Comm_here); - MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, Comm_here); + { + double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz}; + double scalar_in[7]; + MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, Comm_here); + mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3]; + px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6]; + } #ifdef GaussInt mass = mass * rex * rex * dphi * factor; @@ -3020,15 +3084,13 @@ void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *c } } - MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - - MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - - MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + { + double 
scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz}; + double scalar_in[7]; + MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3]; + px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6]; + } #ifdef GaussInt mass = mass * rex * rex * dphi * factor; @@ -3607,8 +3669,17 @@ void surface_integral::surf_Wave(double rex, cgh *GH, ShellPatch *SH, } //|------+ Communicate and sum the results from each processor. - MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + { + double *RPIP_out = new double[2 * NN]; + double *RPIP = new double[2 * NN]; + memcpy(RPIP_out, RP_out, NN * sizeof(double)); + memcpy(RPIP_out + NN, IP_out, NN * sizeof(double)); + MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + memcpy(RP, RPIP, NN * sizeof(double)); + memcpy(IP, RPIP + NN, NN * sizeof(double)); + delete[] RPIP_out; + delete[] RPIP; + } //|------= Free memory. 
diff --git a/makefile_and_run.py b/makefile_and_run.py index 73c9ff6..096ed58 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -10,6 +10,18 @@ import AMSS_NCKU_Input as input_data import subprocess +import time +## CPU core binding configuration using taskset +## taskset ensures all child processes inherit the CPU affinity mask +## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111) +## Format: taskset -c 4-55,60-111 ensures processes only run on these cores +#NUMACTL_CPU_BIND = "taskset -c 0-111" +NUMACTL_CPU_BIND = "taskset -c 16-47,64-95" + +## Build parallelism configuration +## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores +## Set make -j to utilize available cores for faster builds +BUILD_JOBS = 96 ################################################################## @@ -26,11 +38,11 @@ def makefile_ABE(): print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) print( ) - ## Build command + ## Build command with CPU binding to nohz_full cores if (input_data.GPU_Calculation == "no"): - makefile_command = "make -j96" + " ABE" + makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE" elif (input_data.GPU_Calculation == "yes"): - makefile_command = "make -j4" + " ABEGPU" + makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU" else: print( " CPU/GPU numerical calculation setting is wrong " ) print( ) @@ -67,8 +79,8 @@ def makefile_TwoPunctureABE(): print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " ) print( ) - ## Build command - makefile_command = "make" + " TwoPunctureABE" + ## Build command with CPU binding to nohz_full cores + makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE" ## Execute the command with subprocess.Popen and stream output makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) @@ -105,10 +117,11 @@ def run_ABE(): ## Define the command 
to run; cast other values to strings as needed if (input_data.GPU_Calculation == "no"): - mpi_command = "mpirun -np " + str(input_data.MPI_processes) + " ./ABE" + mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" + #mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command_outfile = "ABE_out.log" elif (input_data.GPU_Calculation == "yes"): - mpi_command = "mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" + mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command_outfile = "ABEGPU_out.log" ## Execute the MPI command and stream output @@ -141,13 +154,14 @@ def run_ABE(): ## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE def run_TwoPunctureABE(): - + tp_time1=time.time() print( ) print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) print( ) ## Define the command to run - TwoPuncture_command = "./TwoPunctureABE" + #TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE" + TwoPuncture_command = " ./TwoPunctureABE" TwoPuncture_command_outfile = "TwoPunctureABE_out.log" ## Execute the command with subprocess.Popen and stream output @@ -168,7 +182,9 @@ def run_TwoPunctureABE(): print( ) print( " The TwoPunctureABE simulation is finished " ) print( ) - + tp_time2=time.time() + et=tp_time2-tp_time1 + print(f"Used time: {et}") return ################################################################## diff --git a/parallel_plot_helper.py b/parallel_plot_helper.py new file mode 100644 index 0000000..c1168fa --- /dev/null +++ b/parallel_plot_helper.py @@ -0,0 +1,29 @@ +import multiprocessing + +def run_plot_task(task): + """Execute a single plotting task. + + Parameters + ---------- + task : tuple + A tuple of (function, args_tuple) where function is a callable + plotting function and args_tuple contains its arguments. 
+ """ + func, args = task + return func(*args) + + +def run_plot_tasks_parallel(plot_tasks): + """Execute a list of independent plotting tasks in parallel. + + Uses the 'fork' context to create worker processes so that the main + script is NOT re-imported/re-executed in child processes. + + Parameters + ---------- + plot_tasks : list of tuples + Each element is (function, args_tuple). + """ + ctx = multiprocessing.get_context('fork') + with ctx.Pool() as pool: + pool.map(run_plot_task, plot_tasks) diff --git a/pgo_profile/PGO_Profile_Analysis.md b/pgo_profile/PGO_Profile_Analysis.md new file mode 100644 index 0000000..bff40c0 --- /dev/null +++ b/pgo_profile/PGO_Profile_Analysis.md @@ -0,0 +1,97 @@ +# AMSS-NCKU PGO Profile Analysis Report + +## 1. Profiling Environment + +| Item | Value | +|------|-------| +| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) | +| Instrumentation Flag | `-fprofile-instr-generate` | +| Optimization Level (instrumented) | `-O2 -xHost -fma` | +| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) | +| Profile File | `default_9725750769337483397_0.profraw` (327 KB) | +| Merged Profile | `default.profdata` (394 KB) | +| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` | + +## 2. Reduced Simulation Parameters (for profiling run) + +| Parameter | Production Value | Profiling Value | +|-----------|-----------------|-----------------| +| MPI_processes | 64 | 1 | +| grid_level | 9 | 4 | +| static_grid_level | 5 | 3 | +| static_grid_number | 96 | 24 | +| moving_grid_number | 48 | 16 | +| largest_box_xyz_max | 320^3 | 160^3 | +| Final_Evolution_Time | 1000.0 | 10.0 | +| Evolution_Step_Number | 10,000,000 | 1,000 | +| Detector_Number | 12 | 2 | + +## 3. 
Profile Summary + +| Metric | Value | +|--------|-------| +| Total instrumented functions | 1,392 | +| Functions with non-zero counts | 117 (8.4%) | +| Functions with zero counts | 1,275 (91.6%) | +| Maximum function entry count | 386,459,248 | +| Maximum internal block count | 370,477,680 | +| Total block count | 4,198,023,118 | + +## 4. Top 20 Hotspot Functions + +| Rank | Total Count | Max Block Count | Function | Category | +|------|------------|-----------------|----------|----------| +| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation | +| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation | +| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution | +| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary | +| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil | +| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision | +| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator | +| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation | +| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences | +| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility | +| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives | +| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction | +| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation | +| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy | +| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation | +| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights | +| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation | +| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary | +| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) | +| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation | + +## 5. 
Hotspot Category Breakdown + +Top 20 functions account for ~98% of total execution counts: + +| Category | Functions | Combined Count | Share | +|----------|-----------|---------------|-------| +| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% | +| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% | +| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% | +| Time integration | rungekutta4_rout_ | ~119M | ~3% | +| Dissipation | kodis_ | ~92M | ~2% | +| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% | + +## 6. Conclusions + +1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts. +2. **Hotspot concentration is high**: Top 5 functions alone account for ~76% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets. +3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout. +4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization. +5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs. + +## 7. 
PGO Phase 2 Usage + +To apply the profile, use the following flags in `makefile.inc`: + +```makefile +CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ + -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \ + -Dfortran3 -Dnewc -I${MKLROOT}/include +f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ + -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \ + -align array64byte -fpp -I${MKLROOT}/include +``` diff --git a/pgo_profile/default.profdata b/pgo_profile/default.profdata new file mode 100644 index 0000000..dfac738 Binary files /dev/null and b/pgo_profile/default.profdata differ diff --git a/pgo_profile/default_9725750769337483397_0.profraw b/pgo_profile/default_9725750769337483397_0.profraw new file mode 100644 index 0000000..c9c2485 Binary files /dev/null and b/pgo_profile/default_9725750769337483397_0.profraw differ diff --git a/plot_GW_strain_amplitude_xiaoqu.py b/plot_GW_strain_amplitude_xiaoqu.py index 739f3d4..cf7b098 100755 --- a/plot_GW_strain_amplitude_xiaoqu.py +++ b/plot_GW_strain_amplitude_xiaoqu.py @@ -11,6 +11,8 @@ import numpy ## numpy for array operations import scipy ## scipy for interpolation and signal processing import math +import matplotlib +matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety import matplotlib.pyplot as plt ## matplotlib for plotting import os ## os for system/file operations diff --git a/plot_binary_data.py b/plot_binary_data.py index 0694f4f..2aca1c7 100755 --- a/plot_binary_data.py +++ b/plot_binary_data.py @@ -8,16 +8,23 @@ ## ################################################# +## Restrict OpenMP to one thread per process so that running +## many workers in parallel does not create an O(workers * BLAS_threads) +## thread explosion. The variable MUST be set before numpy/scipy +## are imported, because the BLAS library reads them only at load time. 
+import os +os.environ.setdefault("OMP_NUM_THREADS", "1") + import numpy import scipy +import matplotlib +matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety import matplotlib.pyplot as plt from matplotlib.colors import LogNorm from mpl_toolkits.mplot3d import Axes3D ## import torch import AMSS_NCKU_Input as input_data -import os - ######################################################################################### @@ -192,3 +199,19 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ): #################################################################################### + +#################################################################################### +## Allow this module to be run as a standalone script so that each +## binary-data plot can be executed in a fresh subprocess whose BLAS +## environment variables (set above) take effect before numpy loads. +## +## Usage: python3 plot_binary_data.py +#################################################################################### + +if __name__ == '__main__': + import sys + if len(sys.argv) != 4: + print(f"Usage: {sys.argv[0]} ") + sys.exit(1) + plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3]) + diff --git a/plot_xiaoqu.py b/plot_xiaoqu.py index 7711d5a..47970cf 100755 --- a/plot_xiaoqu.py +++ b/plot_xiaoqu.py @@ -8,6 +8,8 @@ ################################################# import numpy ## numpy for array operations +import matplotlib +matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety import matplotlib.pyplot as plt ## matplotlib for plotting from mpl_toolkits.mplot3d import Axes3D ## needed for 3D plots import glob @@ -15,6 +17,9 @@ import os ## operating system utilities import plot_binary_data import AMSS_NCKU_Input as input_data +import subprocess +import sys +import multiprocessing # plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots @@ -50,10 +55,40 @@ def generate_binary_data_plot( 
binary_outdir, figure_outdir ): file_list.append(x) print(x) - ## Plot each file in the list + ## Plot each file in parallel using subprocesses. + ## Each subprocess is a fresh Python process where the BLAS thread-count + ## environment variables (set at the top of plot_binary_data.py) take + ## effect before numpy is imported. This avoids the thread explosion + ## that occurs when multiprocessing.Pool with 'fork' context inherits + ## already-initialized multi-threaded BLAS from the parent. + script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" ) + max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0 + + running = [] + failed = [] for filename in file_list: print(filename) - plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir) + proc = subprocess.Popen( + [sys.executable, script, filename, binary_outdir, figure_outdir], + ) + running.append( (proc, filename) ) + ## Keep at most max_workers subprocesses active at a time + if len(running) >= max_workers: + p, fn = running.pop(0) + p.wait() + if p.returncode != 0: + failed.append(fn) + + ## Wait for all remaining subprocesses to finish + for p, fn in running: + p.wait() + if p.returncode != 0: + failed.append(fn) + + if failed: + print( " WARNING: the following binary data plots failed:" ) + for fn in failed: + print( " ", fn ) print( ) print( " Binary Data Plot Has been Finished " )