Compare commits

..

5 Commits

Author SHA1 Message Date
6fffaa13f6 Optimize buffer_width dynamically based on FD order to improve scalability 2026-01-31 19:04:19 +08:00
6684016e8c Optimize MPI domain decomposition min_width calculation to improve scalability 2026-01-31 16:23:16 +08:00
d11eaa2242 Optimize bssn_rhs.f90: Fuse loops for metric inversion and Christoffel symbols to improve cache locality 2026-01-21 11:22:33 +08:00
ef96766e22 优化 compute_rhs_bssn 热点路径并加入 NaN 检查开关
- 用 DEBUG_NAN_CHECK 宏按需启用 NaN 检查,并在输入/宏生成器中新增 Debug_NaN_Check 配置
  - 逆度量改为先求行列式再乘法展开,减少除法;并在 Gam^i/Christoffel 处提取公共子表达式
  - 预置批量 fderivs 辅助例程,便于后续矢量化/合并导数计算
  - 将默认 MPI_processes 调整为 8

  变更涉及:

  - AMSS_NCKU_source/bssn_rhs.f90
  - generate_macrodef.py
  - AMSS_NCKU_Input.py
  - AMSS_NCKU_Input_Mini.py
  - inputfile_example/AMSS_NCKU_Input.py
  - AMSS_NCKU_source/diff_new.f90

TODO: fmisc.f90 polint()
2026-01-20 19:37:26 +08:00
ae7b77e44c Setup GW150914-mini test case for laptop development
- Add AMSS_NCKU_Input_Mini.py with reduced grid resolution and MPI processes
- Add AMSS_NCKU_MiniProgram.py launcher with automatic configuration swapping
- Update makefile_and_run.py to reduce build jobs and remove CPU binding for laptop
- Update .gitignore to exclude GW150914-mini output directory
2026-01-20 00:31:40 +08:00
37 changed files with 15691 additions and 17755 deletions

1
.gitignore vendored
View File

@@ -1,6 +1,7 @@
__pycache__ __pycache__
GW150914 GW150914
GW150914-origin GW150914-origin
GW150914-mini
docs docs
*.tmp *.tmp

View File

@@ -16,12 +16,14 @@ import numpy
File_directory = "GW150914" ## output file directory File_directory = "GW150914" ## output file directory
Output_directory = "binary_output" ## binary data file directory Output_directory = "binary_output" ## binary data file directory
## The file directory name should not be too long ## The file directory name should not be too long
MPI_processes = 64 ## number of mpi processes used in the simulation MPI_processes = 8 ## number of mpi processes used in the simulation
GPU_Calculation = "no" ## Use GPU or not GPU_Calculation = "no" ## Use GPU or not
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface) ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
CPU_Part = 1.0 CPU_Part = 1.0
GPU_Part = 0.0 GPU_Part = 0.0
Debug_NaN_Check = 0 ## enable NaN checks in compute_rhs_bssn: 0 (off) or 1 (on)
################################################# #################################################

233
AMSS_NCKU_Input_Mini.py Normal file
View File

@@ -0,0 +1,233 @@
#################################################
##
## This file provides the input parameters required for numerical relativity.
## XIAOQU
## 2024/03/19 --- 2025/09/14
## Modified for GW150914-mini test case
##
#################################################
import numpy
#################################################
## Setting MPI processes and the output file directory
File_directory = "GW150914-mini" ## output file directory
Output_directory = "binary_output" ## binary data file directory
## The file directory name should not be too long
MPI_processes = 4 ## number of mpi processes used in the simulation (Reduced for laptop)
GPU_Calculation = "no" ## Use GPU or not
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
CPU_Part = 1.0
GPU_Part = 0.0
#################################################
#################################################
## Setting the physical system and numerical method
Symmetry = "equatorial-symmetry" ## Symmetry of System: choose equatorial-symmetry、no-symmetry、octant-symmetry
Equation_Class = "BSSN" ## Evolution Equation: choose "BSSN", "BSSN-EScalar", "BSSN-EM", "Z4C"
## If "BSSN-EScalar" is chosen, it is necessary to set other parameters below
Initial_Data_Method = "Ansorg-TwoPuncture" ## initial data method: choose "Ansorg-TwoPuncture", "Lousto-Analytical", "Cao-Analytical", "KerrSchild-Analytical"
Time_Evolution_Method = "runge-kutta-45" ## time evolution method: choose "runge-kutta-45"
Finite_Diffenence_Method = "4th-order" ## finite-difference method: choose "2nd-order", "4th-order", "6th-order", "8th-order"
Debug_NaN_Check = 0 ## enable NaN checks in compute_rhs_bssn: 0 (off) or 1 (on)
#################################################
#################################################
## Setting the time evolutionary information
Start_Evolution_Time = 0.0 ## start evolution time t0
Final_Evolution_Time = 100.0 ## final evolution time t1 (Reduced for quick test)
Check_Time = 10.0
Dump_Time = 10.0 ## time inteval dT for dumping binary data
D2_Dump_Time = 10.0 ## dump the ascii data for 2d surface after dT'
Analysis_Time = 1.0 ## dump the puncture position and GW psi4 after dT"
Evolution_Step_Number = 10000000 ## stop the calculation after the maximal step number
Courant_Factor = 0.5 ## Courant Factor
Dissipation = 0.15 ## Kreiss-Oliger Dissipation Strength
#################################################
#################################################
## Setting the grid structure
basic_grid_set = "Patch" ## grid structure: choose "Patch" or "Shell-Patch"
grid_center_set = "Cell" ## grid center: chose "Cell" or "Vertex"
grid_level = 7 ## total number of AMR grid levels (Reduced from 9)
static_grid_level = 4 ## number of AMR static grid levels (Reduced from 5)
moving_grid_level = grid_level - static_grid_level ## number of AMR moving grid levels
analysis_level = 0
refinement_level = 3 ## time refinement start from this grid level
largest_box_xyz_max = [320.0, 320.0, 320.0] ## scale of the largest box
## not ne cess ary to be cubic for "Patch" grid s tructure
## need to be a cubic box for "Shell-Patch" grid structure
largest_box_xyz_min = - numpy.array(largest_box_xyz_max)
static_grid_number = 48 ## grid points of each static AMR grid (in x direction) (Reduced from 96)
## (grid points in y and z directions are automatically adjusted)
moving_grid_number = 24 ## grid points of each moving AMR grid (Reduced from 48)
shell_grid_number = [32, 32, 100] ## grid points of Shell-Patch grid
## in (phi, theta, r) direction
devide_factor = 2.0 ## resolution between different grid levels dh0/dh1, only support 2.0 now
static_grid_type = 'Linear' ## AMR static grid structure , only supports "Linear"
moving_grid_type = 'Linear' ## AMR moving grid structure , only supports "Linear"
quarter_sphere_number = 48 ## grid number of 1/4 s pher ical surface (Reduced from 96)
## (which is needed for evaluating the spherical surface integral)
#################################################
#################################################
## Setting the puncture information
puncture_number = 2
position_BH = numpy.zeros( (puncture_number, 3) )
parameter_BH = numpy.zeros( (puncture_number, 3) )
dimensionless_spin_BH = numpy.zeros( (puncture_number, 3) )
momentum_BH = numpy.zeros( (puncture_number, 3) )
puncture_data_set = "Manually" ## Method to give Punctures positions and momentum
## choose "Manually" or "Automatically-BBH"
## Prefer to choose "Manually", because "Automatically-BBH" is developing now
## initial orbital distance and ellipticity for BBHs system
## ( needed for "Automatically-BBH" case , not affect the "Manually" case )
Distance = 10.0
e0 = 0.0
## black hole parameter (M Q* a*)
parameter_BH[0] = [ 36.0/(36.0+29.0), 0.0, +0.31 ]
parameter_BH[1] = [ 29.0/(36.0+29.0), 0.0, -0.46 ]
## dimensionless spin in each direction
dimensionless_spin_BH[0] = [ 0.0, 0.0, +0.31 ]
dimensionless_spin_BH[1] = [ 0.0, 0.0, -0.46 ]
## use Brugmann's convention
## -----0-----> y
## - +
#---------------------------------------------
## If puncture_data_set is chosen to be "Manually", it is necessary to set the position and momentum of each puncture manually
## initial position for each puncture
position_BH[0] = [ 0.0, 10.0*29.0/(36.0+29.0), 0.0 ]
position_BH[1] = [ 0.0, -10.0*36.0/(36.0+29.0), 0.0 ]
## initial mumentum for each puncture
## (needed for "Manually" case, does not affect the "Automatically-BBH" case)
momentum_BH[0] = [ -0.09530152296974252, -0.00084541526517121, 0.0 ]
momentum_BH[1] = [ +0.09530152296974252, +0.00084541526517121, 0.0 ]
#################################################
#################################################
## Setting the gravitational wave information
GW_L_max = 4 ## maximal L number in gravitational wave
GW_M_max = 4 ## maximal M number in gravitational wave
Detector_Number = 12 ## number of dector
Detector_Rmin = 50.0 ## nearest dector distance
Detector_Rmax = 160.0 ## farest dector distance
#################################################
#################################################
## Setting the apprent horizon
AHF_Find = "no" ## whether to find the apparent horizon: choose "yes" or "no"
AHF_Find_Every = 24
AHF_Dump_Time = 20.0
#################################################
#################################################
## Other parameters (testing)
## Only influence the Equation_Class = "BSSN-EScalar" case
FR_a2 = 3.0 ## f(R) = R + a2 * R^2
FR_l2 = 10000.0
FR_phi0 = 0.00005
FR_r0 = 120.0
FR_sigma0 = 8.0
FR_Choice = 2 ## Choice options: 1 2 3 4 5
## 1: phi(r) = phi0 * Exp(-(r-r0)**2/sigma0)
## V(r) = 0
## 2: phi(r) = phi0 * a2^2/(1+a2^2)
## V(r) = Exp(-8*Sqrt(PI/3)*phi(r)) * (1-Exp(4*Sqrt(PI/3)*phi(r)))**2 / (32*PI*a2)
## 3: Schrodinger-Newton gived by system phi(r)
## V(r) = Exp(-8*Sqrt(PI/3)*phi(r)) * (1-Exp(4*Sqrt(PI/3)*phi(r)))**2 / (32*PI*a2)
## 4: phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma0) - tanh((r-r0)/sigma0) )
## V(r) = 0
## f(R) = R + a2*R^2 with a2 = +oo
## 5: phi(r) = phi0 * Exp(-(r-r0)**2/sigma)
## V(r) = 0
#################################################
#################################################
## Other parameters (testing)
## (please do not change if not necessary)
boundary_choice = "BAM-choice" ## Sommerfeld boundary condition : choose "BAM-choice" or "Shibata-choice"
## prefer "BAM-choice"
gauge_choice = 0 ## gauge choice
## 0: B^i gauge
## 1: David's puncture gauge
## 2: MB B^i gauge
## 3: RIT B^i gauge
## 4: MB beta gauge
## 5: RIT beta gauge
## 6: MGB1 B^i gauge
## 7: MGB2 B^i gauge
## prefer 0 or 1
tetrad_type = 2 ## tetradtype
## v:r; u: phi; w: theta
## v^a = (x,y,z)
## 0: orthonormal order: v,u,w
## v^a = (x,y,z)
## m = (phi - i theta)/sqrt(2)
## following Frans, Eq.(8) of PRD 75, 124018(2007)
## 1: orthonormal order: w,u,v
## m = (theta + i phi)/sqrt(2)
## following Sperhake, Eq.(3.2) of PRD 85, 124062(2012)
## 2: orthonormal order: v,u,w
## v_a = (x,y,z)
## m = (phi - i theta)/sqrt(2)
## following Frans, Eq.(8) of PRD 75, 124018(2007)
## this version recommend set to 2
## prefer 2
#################################################

224
AMSS_NCKU_MiniProgram.py Normal file
View File

@@ -0,0 +1,224 @@
##################################################################
##
## AMSS-NCKU Numerical Relativity Mini Test Program
## Author: Assistant (based on Xiaoqu's code)
## 2026/01/20
##
## This script runs a scaled-down version of the GW150914 test case
## suitable for laptop testing.
##
##################################################################
import os
import shutil
import sys
import time
# --- Context Manager for Input File Swapping ---
class InputFileSwapper:
def __init__(self, mini_file="AMSS_NCKU_Input_Mini.py", target_file="AMSS_NCKU_Input.py"):
self.mini_file = mini_file
self.target_file = target_file
self.backup_file = target_file + ".bak"
self.swapped = False
def __enter__(self):
print(f"[MiniProgram] Swapping {self.target_file} with {self.mini_file}...")
if os.path.exists(self.target_file):
shutil.move(self.target_file, self.backup_file)
shutil.copy(self.mini_file, self.target_file)
self.swapped = True
return self
def __exit__(self, exc_type, exc_value, traceback):
if self.swapped:
print(f"[MiniProgram] Restoring original {self.target_file}...")
os.remove(self.target_file)
if os.path.exists(self.backup_file):
shutil.move(self.backup_file, self.target_file)
def main():
# Use the swapper to ensure all imported modules see the mini configuration
with InputFileSwapper():
# Import modules AFTER swapping input file
try:
import AMSS_NCKU_Input as input_data
import print_information
import setup
import numerical_grid
import generate_macrodef
import makefile_and_run
import generate_TwoPuncture_input
import renew_puncture_parameter
import plot_xiaoqu
import plot_GW_strain_amplitude_xiaoqu
except ImportError as e:
print(f"Error importing modules: {e}")
return
print_information.print_program_introduction()
print("\n" + "#"*60)
print(" RUNNING MINI TEST CASE: GW150914-mini")
print("#"*60 + "\n")
# --- Directory Setup ---
File_directory = os.path.join(input_data.File_directory)
if os.path.exists(File_directory):
print(f" Output directory '{File_directory}' exists. Removing for mini test...")
shutil.rmtree(File_directory, ignore_errors=True)
os.mkdir(File_directory)
shutil.copy("AMSS_NCKU_Input.py", File_directory) # Copies the current (mini) input
output_directory = os.path.join(File_directory, "AMSS_NCKU_output")
os.mkdir(output_directory)
binary_results_directory = os.path.join(output_directory, input_data.Output_directory)
os.mkdir(binary_results_directory)
figure_directory = os.path.join(File_directory, "figure")
os.mkdir(figure_directory)
print(" Output directories generated.\n")
# --- Setup and Input Generation ---
setup.print_input_data(File_directory)
setup.generate_AMSSNCKU_input()
setup.print_puncture_information()
print("\n Generating AMSS-NCKU input parfile...")
numerical_grid.append_AMSSNCKU_cgh_input()
print("\n Plotting initial grid...")
numerical_grid.plot_initial_grid()
print("\n Generating macro files...")
generate_macrodef.generate_macrodef_h()
generate_macrodef.generate_macrodef_fh()
# --- Compilation Preparation ---
print("\n Preparing to compile and run...")
AMSS_NCKU_source_path = "AMSS_NCKU_source"
AMSS_NCKU_source_copy = os.path.join(File_directory, "AMSS_NCKU_source_copy")
if not os.path.exists(AMSS_NCKU_source_path):
print(" Error: AMSS_NCKU_source not found! Please run in the project root.")
return
shutil.copytree(AMSS_NCKU_source_path, AMSS_NCKU_source_copy)
macrodef_h_path = os.path.join(File_directory, "macrodef.h")
macrodef_fh_path = os.path.join(File_directory, "macrodef.fh")
shutil.copy2(macrodef_h_path, AMSS_NCKU_source_copy)
shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
# --- Compilation ---
cwd = os.getcwd()
os.chdir(AMSS_NCKU_source_copy)
print(" Compiling ABE...")
makefile_and_run.makefile_ABE()
if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
print(" Compiling TwoPunctureABE...")
makefile_and_run.makefile_TwoPunctureABE()
os.chdir(cwd)
# --- Copy Executables ---
if (input_data.GPU_Calculation == "no"):
ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE")
else:
ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABEGPU")
if not os.path.exists(ABE_file):
print(" Error: ABE executable compilation failed.")
return
shutil.copy2(ABE_file, output_directory)
TwoPuncture_file = os.path.join(AMSS_NCKU_source_copy, "TwoPunctureABE")
if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
if not os.path.exists(TwoPuncture_file):
print(" Error: TwoPunctureABE compilation failed.")
return
shutil.copy2(TwoPuncture_file, output_directory)
# --- Execution ---
start_time = time.time()
if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
print("\n Generating TwoPuncture input...")
generate_TwoPuncture_input.generate_AMSSNCKU_TwoPuncture_input()
AMSS_NCKU_TwoPuncture_inputfile = 'AMSS-NCKU-TwoPuncture.input'
AMSS_NCKU_TwoPuncture_inputfile_path = os.path.join( File_directory, AMSS_NCKU_TwoPuncture_inputfile )
shutil.copy2( AMSS_NCKU_TwoPuncture_inputfile_path, os.path.join(output_directory, 'TwoPunctureinput.par') )
print(" Running TwoPunctureABE...")
os.chdir(output_directory)
makefile_and_run.run_TwoPunctureABE()
os.chdir(cwd)
# Update Puncture Parameter
renew_puncture_parameter.append_AMSSNCKU_BSSN_input(File_directory, output_directory)
AMSS_NCKU_inputfile = 'AMSS-NCKU.input'
AMSS_NCKU_inputfile_path = os.path.join(File_directory, AMSS_NCKU_inputfile)
shutil.copy2( AMSS_NCKU_inputfile_path, os.path.join(output_directory, 'input.par') )
print("\n Input files ready. Launching ABE...")
os.chdir(output_directory)
makefile_and_run.run_ABE()
os.chdir(cwd)
end_time = time.time()
elapsed_time = end_time - start_time
# --- Post-processing ---
print("\n Copying output files for inspection...")
AMSS_NCKU_error_file_path = os.path.join(binary_results_directory, "setting.par")
if os.path.exists(AMSS_NCKU_error_file_path):
shutil.copy( AMSS_NCKU_error_file_path, os.path.join(output_directory, "AMSSNCKU_setting_parameter") )
AMSS_NCKU_error_file_path = os.path.join(binary_results_directory, "Error.log")
if os.path.exists(AMSS_NCKU_error_file_path):
shutil.copy( AMSS_NCKU_error_file_path, os.path.join(output_directory, "Error.log") )
for fname in ["bssn_BH.dat", "bssn_ADMQs.dat", "bssn_psi4.dat", "bssn_constraint.dat"]:
fpath = os.path.join(binary_results_directory, fname)
if os.path.exists(fpath):
shutil.copy(fpath, os.path.join(output_directory, fname))
# --- Plotting ---
print("\n Plotting results...")
try:
plot_xiaoqu.generate_puncture_orbit_plot( binary_results_directory, figure_directory )
plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
for i in range(input_data.Detector_Number):
plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
for i in range(input_data.Detector_Number):
plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
for i in range(input_data.grid_level):
plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
except Exception as e:
print(f"Warning: Plotting failed: {e}")
print(f"\n Program Cost = {elapsed_time:.2f} Seconds \n")
print(" AMSS-NCKU-Python simulation finished (Mini Test).\n")
if __name__ == "__main__":
main()

View File

@@ -8,14 +8,6 @@
## ##
################################################################## ##################################################################
## Guard against re-execution by multiprocessing child processes.
## Without this, using 'spawn' or 'forkserver' context would cause every
## worker to re-run the entire script, spawning exponentially more
## workers (fork bomb).
if __name__ != '__main__':
import sys as _sys
_sys.exit(0)
################################################################## ##################################################################
@@ -66,8 +58,7 @@ if os.path.exists(File_directory):
## Prompt whether to overwrite the existing directory ## Prompt whether to overwrite the existing directory
while True: while True:
try: try:
## inputvalue = input() inputvalue = input()
inputvalue = "continue"
## If the user agrees to overwrite, proceed and remove the existing directory ## If the user agrees to overwrite, proceed and remove the existing directory
if ( inputvalue == "continue" ): if ( inputvalue == "continue" ):
print( " Continue the calculation !!! " ) print( " Continue the calculation !!! " )
@@ -433,31 +424,26 @@ print(
import plot_xiaoqu import plot_xiaoqu
import plot_GW_strain_amplitude_xiaoqu import plot_GW_strain_amplitude_xiaoqu
from parallel_plot_helper import run_plot_tasks_parallel
plot_tasks = []
## Plot black hole trajectory ## Plot black hole trajectory
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot, (binary_results_directory, figure_directory) ) ) plot_xiaoqu.generate_puncture_orbit_plot( binary_results_directory, figure_directory )
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) ) plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
## Plot black hole separation vs. time ## Plot black hole separation vs. time
plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) ) plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
## Plot gravitational waveforms (psi4 and strain amplitude) ## Plot gravitational waveforms (psi4 and strain amplitude)
for i in range(input_data.Detector_Number): for i in range(input_data.Detector_Number):
plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) ) plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) ) plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
## Plot ADM mass evolution ## Plot ADM mass evolution
for i in range(input_data.Detector_Number): for i in range(input_data.Detector_Number):
plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) ) plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
## Plot Hamiltonian constraint violation over time ## Plot Hamiltonian constraint violation over time
for i in range(input_data.grid_level): for i in range(input_data.grid_level):
plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) ) plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
run_plot_tasks_parallel(plot_tasks)
## Plot stored binary data ## Plot stored binary data
plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory ) plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )

View File

@@ -341,9 +341,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
double *Shellf, int Symmetry) double *Shellf, int Symmetry)
{ {
// NOTE: we do not Synchnize variables here, make sure of that before calling this routine // NOTE: we do not Synchnize variables here, make sure of that before calling this routine
int myrank, nprocs; int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int ordn = 2 * ghost_width; int ordn = 2 * ghost_width;
MyList<var> *varl; MyList<var> *varl;
@@ -355,18 +354,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
varl = varl->next; varl = varl->next;
} }
memset(Shellf, 0, sizeof(double) * NN * num_var); double *shellf;
shellf = new double[NN * num_var];
memset(shellf, 0, sizeof(double) * NN * num_var);
// owner_rank[j] records which MPI rank owns point j // we use weight to monitor code, later some day we can move it for optimization
// All ranks traverse the same block list so they all agree on ownership int *weight;
int *owner_rank; weight = new int[NN];
owner_rank = new int[NN]; memset(weight, 0, sizeof(int) * NN);
for (int j = 0; j < NN; j++)
owner_rank[j] = -1; double *DH, *llb, *uub;
DH = new double[dim];
double DH[dim], llb[dim], uub[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
{
DH[i] = getdX(i); DH[i] = getdX(i);
}
llb = new double[dim];
uub = new double[dim];
for (int j = 0; j < NN; j++) // run along points for (int j = 0; j < NN; j++) // run along points
{ {
@@ -398,6 +403,12 @@ void Patch::Interp_Points(MyList<var> *VarList,
bool flag = true; bool flag = true;
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
{ {
// NOTE: our dividing structure is (exclude ghost)
// -1 0
// 1 2
// so (0,1) does not belong to any part for vertex structure
// here we put (0,0.5) to left part and (0.5,1) to right part
// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
#ifdef Vertex #ifdef Vertex
#ifdef Cell #ifdef Cell
#error Both Cell and Vertex are defined #error Both Cell and Vertex are defined
@@ -422,7 +433,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
if (flag) if (flag)
{ {
notfind = false; notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank) if (myrank == BP->rank)
{ {
//---> interpolation //---> interpolation
@@ -430,11 +440,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
int k = 0; int k = 0;
while (varl) // run along variables while (varl) // run along variables
{ {
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], // shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
// pox,ordn,varl->data->SoA,Symmetry);
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next; varl = varl->next;
k++; k++;
} }
weight[j] = 1;
} }
} }
if (Bp == ble) if (Bp == ble)
@@ -442,389 +455,104 @@ void Patch::Interp_Points(MyList<var> *VarList,
Bp = Bp->next; Bp = Bp->next;
} }
} }
// Replace MPI_Allreduce with per-owner MPI_Bcast:
// Group consecutive points by owner rank and broadcast each group. MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
// Since each point's data is non-zero only on the owner rank, int *Weight;
// Bcast from owner is equivalent to Allreduce(MPI_SUM) but much cheaper. Weight = new int[NN];
MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
// misc::tillherecheck("print me");
for (int i = 0; i < NN; i++)
{ {
int j = 0; if (Weight[i] > 1)
while (j < NN)
{ {
int cur_owner = owner_rank[j]; if (myrank == 0)
if (cur_owner < 0) cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
{ for (int j = 0; j < num_var; j++)
if (myrank == 0) Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
{
cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int d = 0; d < dim; d++)
{
cout << XX[d][j];
if (d < dim - 1)
cout << ",";
else
cout << ")";
}
cout << " on Patch (";
for (int d = 0; d < dim; d++)
{
cout << bbox[d] << "+" << lli[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")--";
}
cout << "(";
for (int d = 0; d < dim; d++)
{
cout << bbox[dim + d] << "-" << uui[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")" << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
j++;
continue;
}
// Find contiguous run of points with the same owner
int jstart = j;
while (j < NN && owner_rank[j] == cur_owner)
j++;
int count = (j - jstart) * num_var;
MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
} }
} else if (Weight[i] == 0 && myrank == 0)
delete[] owner_rank;
}
void Patch::Interp_Points(MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry,
int Nmin_consumer, int Nmax_consumer)
{
// Targeted point-to-point overload: each owner sends each point only to
// the one rank that needs it for integration (consumer), reducing
// communication volume by ~nprocs times compared to the Bcast version.
/*
double t_calc_end, t_calc_total = 0;
double t_calc_start = MPI_Wtime();*/
int myrank, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int ordn = 2 * ghost_width;
MyList<var> *varl;
int num_var = 0;
varl = VarList;
while (varl)
{
num_var++;
varl = varl->next;
}
memset(Shellf, 0, sizeof(double) * NN * num_var);
// owner_rank[j] records which MPI rank owns point j
int *owner_rank;
owner_rank = new int[NN];
for (int j = 0; j < NN; j++)
owner_rank[j] = -1;
double DH[dim], llb[dim], uub[dim];
for (int i = 0; i < dim; i++)
DH[i] = getdX(i);
// --- Interpolation phase (identical to original) ---
for (int j = 0; j < NN; j++)
{
double pox[dim];
for (int i = 0; i < dim; i++)
{
pox[i] = XX[i][j];
if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; k++)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
MyList<Block> *Bp = blb;
bool notfind = true;
while (notfind && Bp)
{
Block *BP = Bp->data;
bool flag = true;
for (int i = 0; i < dim; i++)
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
{
flag = false;
break;
}
}
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
varl = VarList;
int k = 0;
while (varl)
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
if (Bp == ble)
break;
Bp = Bp->next;
}
}
/*
t_calc_end = MPI_Wtime();
t_calc_total = t_calc_end - t_calc_start;*/
// --- Error check for unfound points ---
for (int j = 0; j < NN; j++)
{
if (owner_rank[j] < 0 && myrank == 0)
{ {
cout << "ERROR: Patch::Interp_Points fails to find point ("; cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int d = 0; d < dim; d++) for (int j = 0; j < dim; j++)
{ {
cout << XX[d][j]; cout << XX[j][i];
if (d < dim - 1) if (j < dim - 1)
cout << ","; cout << ",";
else else
cout << ")"; cout << ")";
} }
cout << " on Patch ("; cout << " on Patch (";
for (int d = 0; d < dim; d++) for (int j = 0; j < dim; j++)
{ {
cout << bbox[d] << "+" << lli[d] * DH[d]; cout << bbox[j] << "+" << lli[j] * getdX(j);
if (d < dim - 1) if (j < dim - 1)
cout << ","; cout << ",";
else else
cout << ")--"; cout << ")--";
} }
cout << "("; cout << "(";
for (int d = 0; d < dim; d++) for (int j = 0; j < dim; j++)
{ {
cout << bbox[dim + d] << "-" << uui[d] * DH[d]; cout << bbox[dim + j] << "-" << uui[j] * getdX(j);
if (d < dim - 1) if (j < dim - 1)
cout << ","; cout << ",";
else else
cout << ")" << endl; cout << ")" << endl;
} }
#if 0
checkBlock();
#else
cout << "splited domains:" << endl;
{
MyList<Block> *Bp = blb;
while (Bp)
{
Block *BP = Bp->data;
for (int i = 0; i < dim; i++)
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
}
cout << "(";
for (int j = 0; j < dim; j++)
{
cout << llb[j] << ":" << uub[j];
if (j < dim - 1)
cout << ",";
else
cout << ")" << endl;
}
if (Bp == ble)
break;
Bp = Bp->next;
}
}
#endif
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
} }
// --- Targeted point-to-point communication phase --- delete[] shellf;
// Compute consumer_rank[j] using the same deterministic formula as surface_integral delete[] weight;
int *consumer_rank = new int[NN]; delete[] Weight;
{ delete[] DH;
int mp = NN / nprocs; delete[] llb;
int Lp = NN - nprocs * mp; delete[] uub;
for (int j = 0; j < NN; j++)
{
if (j < Lp * (mp + 1))
consumer_rank[j] = j / (mp + 1);
else
consumer_rank[j] = Lp + (j - Lp * (mp + 1)) / mp;
}
}
// Count sends and recvs per rank
int *send_count = new int[nprocs];
int *recv_count = new int[nprocs];
memset(send_count, 0, sizeof(int) * nprocs);
memset(recv_count, 0, sizeof(int) * nprocs);
for (int j = 0; j < NN; j++)
{
int own = owner_rank[j];
int con = consumer_rank[j];
if (own == con)
continue; // local — no communication needed
if (own == myrank)
send_count[con]++;
if (con == myrank)
recv_count[own]++;
}
// Build send buffers: for each destination rank, pack (index, data) pairs
// Each entry: 1 int (point index j) + num_var doubles
int total_send = 0, total_recv = 0;
int *send_offset = new int[nprocs];
int *recv_offset = new int[nprocs];
for (int r = 0; r < nprocs; r++)
{
send_offset[r] = total_send;
total_send += send_count[r];
recv_offset[r] = total_recv;
total_recv += recv_count[r];
}
// Pack send buffers: each message contains (j, data[0..num_var-1]) per point
int stride = 1 + num_var; // 1 double for index + num_var doubles for data
double *sendbuf = new double[total_send * stride];
double *recvbuf = new double[total_recv * stride];
// Temporary counters for packing
int *pack_pos = new int[nprocs];
memset(pack_pos, 0, sizeof(int) * nprocs);
for (int j = 0; j < NN; j++)
{
int own = owner_rank[j];
int con = consumer_rank[j];
if (own != myrank || con == myrank)
continue;
int pos = (send_offset[con] + pack_pos[con]) * stride;
sendbuf[pos] = (double)j; // point index
for (int v = 0; v < num_var; v++)
sendbuf[pos + 1 + v] = Shellf[j * num_var + v];
pack_pos[con]++;
}
// Post non-blocking recvs and sends
int n_req = 0;
for (int r = 0; r < nprocs; r++)
{
if (recv_count[r] > 0) n_req++;
if (send_count[r] > 0) n_req++;
}
MPI_Request *reqs = new MPI_Request[n_req];
int req_idx = 0;
for (int r = 0; r < nprocs; r++)
{
if (recv_count[r] > 0)
{
MPI_Irecv(recvbuf + recv_offset[r] * stride,
recv_count[r] * stride, MPI_DOUBLE,
r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
}
}
for (int r = 0; r < nprocs; r++)
{
if (send_count[r] > 0)
{
MPI_Isend(sendbuf + send_offset[r] * stride,
send_count[r] * stride, MPI_DOUBLE,
r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
}
}
if (n_req > 0)
MPI_Waitall(n_req, reqs, MPI_STATUSES_IGNORE);
// Unpack recv buffers into Shellf
for (int i = 0; i < total_recv; i++)
{
int pos = i * stride;
int j = (int)recvbuf[pos];
for (int v = 0; v < num_var; v++)
Shellf[j * num_var + v] = recvbuf[pos + 1 + v];
}
delete[] reqs;
delete[] sendbuf;
delete[] recvbuf;
delete[] pack_pos;
delete[] send_offset;
delete[] recv_offset;
delete[] send_count;
delete[] recv_count;
delete[] consumer_rank;
delete[] owner_rank;
/*
// 4. 汇总并输出真正干活最慢的 Top 4
struct RankStats {
int rank;
double calc_time; // 净计算时间
};
// 创建当前进程的统计数据
RankStats local_stat;
local_stat.rank = myrank;
local_stat.calc_time = t_calc_total;
// 为所有进程的统计数据分配内存
RankStats *all_stats = nullptr;
if (myrank == 0) {
all_stats = new RankStats[nprocs];
}
// 使用MPI_Gather收集所有进程的数据到rank 0
MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
all_stats, sizeof(RankStats), MPI_BYTE,
0, MPI_COMM_WORLD);
// 准备输出前4个rank的信息所有rank都参与确保广播后一致
int top10_ranks[10] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
double top10_times[10] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
int num_top10 = 0;
if (myrank == 0) {
// 按 calc_time净计算时间排序
std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
return a.calc_time > b.calc_time;
});
// 取前4个
num_top10 = (nprocs < 10) ? nprocs : 10;
for (int i = 0; i < num_top10; i++) {
top10_ranks[i] = all_stats[i].rank;
top10_times[i] = all_stats[i].calc_time;
}
printf("\n--- Top %d Ranks by ACTIVE COMPUTATION (CPU Time) ---\n", num_top10);
for (int i = 0; i < num_top10; i++) {
printf("Rank [%4d]: Calc %.6f s\n", top10_ranks[i], top10_times[i]);
}
// 清理分配的内存
delete[] all_stats;
}
// 广播前4个rank的信息给所有进程
MPI_Bcast(&num_top10, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (num_top10 > 0) {
MPI_Bcast(top10_ranks, 10, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(top10_times, 10, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
*/
} }
void Patch::Interp_Points(MyList<var> *VarList, void Patch::Interp_Points(MyList<var> *VarList,
int NN, double **XX, int NN, double **XX,
@@ -845,22 +573,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
varl = varl->next; varl = varl->next;
} }
memset(Shellf, 0, sizeof(double) * NN * num_var); double *shellf;
shellf = new double[NN * num_var];
memset(shellf, 0, sizeof(double) * NN * num_var);
// owner_rank[j] stores the global rank that owns point j // we use weight to monitor code, later some day we can move it for optimization
int *owner_rank; int *weight;
owner_rank = new int[NN]; weight = new int[NN];
for (int j = 0; j < NN; j++) memset(weight, 0, sizeof(int) * NN);
owner_rank[j] = -1;
// Build global-to-local rank translation for Comm_here double *DH, *llb, *uub;
MPI_Group world_group, local_group; DH = new double[dim];
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
MPI_Comm_group(Comm_here, &local_group);
double DH[dim], llb[dim], uub[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
{
DH[i] = getdX(i); DH[i] = getdX(i);
}
llb = new double[dim];
uub = new double[dim];
for (int j = 0; j < NN; j++) // run along points for (int j = 0; j < NN; j++) // run along points
{ {
@@ -892,6 +622,12 @@ void Patch::Interp_Points(MyList<var> *VarList,
bool flag = true; bool flag = true;
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
{ {
// NOTE: our dividing structure is (exclude ghost)
// -1 0
// 1 2
// so (0,1) does not belong to any part for vertex structure
// here we put (0,0.5) to left part and (0.5,1) to right part
// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
#ifdef Vertex #ifdef Vertex
#ifdef Cell #ifdef Cell
#error Both Cell and Vertex are defined #error Both Cell and Vertex are defined
@@ -916,7 +652,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
if (flag) if (flag)
{ {
notfind = false; notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank) if (myrank == BP->rank)
{ {
//---> interpolation //---> interpolation
@@ -924,11 +659,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
int k = 0; int k = 0;
while (varl) // run along variables while (varl) // run along variables
{ {
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], // shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
// pox,ordn,varl->data->SoA,Symmetry);
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next; varl = varl->next;
k++; k++;
} }
weight[j] = 1;
} }
} }
if (Bp == ble) if (Bp == ble)
@@ -937,35 +675,97 @@ void Patch::Interp_Points(MyList<var> *VarList,
} }
} }
// Collect unique global owner ranks and translate to local ranks in Comm_here MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, Comm_here);
// Then broadcast each owner's points via MPI_Bcast on Comm_here int *Weight;
{ Weight = new int[NN];
int j = 0; MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, Comm_here);
while (j < NN)
{
int cur_owner_global = owner_rank[j];
if (cur_owner_global < 0)
{
// Point not found — skip (error check disabled for sub-communicator levels)
j++;
continue;
}
// Translate global rank to local rank in Comm_here
int cur_owner_local;
MPI_Group_translate_ranks(world_group, 1, &cur_owner_global, local_group, &cur_owner_local);
// Find contiguous run of points with the same owner // misc::tillherecheck("print me");
int jstart = j; // if(lmyrank == 0) cout<<"myrank = "<<myrank<<"print me"<<endl;
while (j < NN && owner_rank[j] == cur_owner_global)
j++; for (int i = 0; i < NN; i++)
int count = (j - jstart) * num_var; {
MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner_local, Comm_here); if (Weight[i] > 1)
{
if (lmyrank == 0)
cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
for (int j = 0; j < num_var; j++)
Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
} }
#if 0 // for not involved levels, this may fail
else if(Weight[i] == 0 && lmyrank == 0)
{
cout<<"ERROR: Patch::Interp_Points fails to find point (";
for(int j=0;j<dim;j++)
{
cout<<XX[j][i];
if(j<dim-1) cout<<",";
else cout<<")";
}
cout<<" on Patch (";
for(int j=0;j<dim;j++)
{
cout<<bbox[j]<<"+"<<lli[j]*getdX(j);
if(j<dim-1) cout<<",";
else cout<<")--";
}
cout<<"(";
for(int j=0;j<dim;j++)
{
cout<<bbox[dim+j]<<"-"<<uui[j]*getdX(j);
if(j<dim-1) cout<<",";
else cout<<")"<<endl;
}
#if 0
checkBlock();
#else
cout<<"splited domains:"<<endl;
{
MyList<Block> *Bp=blb;
while(Bp)
{
Block *BP=Bp->data;
for(int i=0;i<dim;i++)
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb[i] = (feq(BP->bbox[i] ,bbox[i] ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i] : BP->bbox[i] +(ghost_width-0.5)*DH[i];
uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-(ghost_width-0.5)*DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i] ,bbox[i] ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i] : BP->bbox[i] +ghost_width*DH[i];
uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-ghost_width*DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
}
cout<<"(";
for(int j=0;j<dim;j++)
{
cout<<llb[j]<<":"<<uub[j];
if(j<dim-1) cout<<",";
else cout<<")"<<endl;
}
if(Bp == ble) break;
Bp=Bp->next;
}
}
#endif
MPI_Abort(MPI_COMM_WORLD,1);
}
#endif
} }
MPI_Group_free(&world_group); delete[] shellf;
MPI_Group_free(&local_group); delete[] weight;
delete[] owner_rank; delete[] Weight;
delete[] DH;
delete[] llb;
delete[] uub;
} }
void Patch::checkBlock() void Patch::checkBlock()
{ {

View File

@@ -39,10 +39,6 @@ public:
bool Find_Point(double *XX); bool Find_Point(double *XX);
void Interp_Points(MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry,
int Nmin_consumer, int Nmax_consumer);
void Interp_Points(MyList<var> *VarList, void Interp_Points(MyList<var> *VarList,
int NN, double **XX, int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here); double *Shellf, int Symmetry, MPI_Comm Comm_here);

View File

@@ -24,7 +24,6 @@ using namespace std;
#endif #endif
#include <mpi.h> #include <mpi.h>
#include <memory.h>
#include "MyList.h" #include "MyList.h"
#include "Block.h" #include "Block.h"
#include "Parallel.h" #include "Parallel.h"

File diff suppressed because it is too large Load Diff

View File

@@ -11,7 +11,7 @@
#include <cmath> #include <cmath>
#include <new> #include <new>
using namespace std; using namespace std;
#include <memory.h>
#include "Parallel_bam.h" #include "Parallel_bam.h"
#include "var.h" #include "var.h"
#include "MPatch.h" #include "MPatch.h"
@@ -32,16 +32,6 @@ namespace Parallel
int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape); int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
MyList<Block> *distribute_hard(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
int ib0_orig, int ib3_orig,
int jb1_orig, int jb4_orig,
int kb2_orig, int kb5_orig,
Patch* PP, int r_left, int r_right,
int ingfsi, int fngfsi, bool periodic,
Block* &split_first_block, Block* &split_last_block);
Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
int block_id, int ingfsi, int fngfsi, int lev);
void KillBlocks(MyList<Patch> *PatchLIST); void KillBlocks(MyList<Patch> *PatchLIST);
void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z)); void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
@@ -91,43 +81,6 @@ namespace Parallel
int Symmetry); int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry); void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry); void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
struct SyncCache {
bool valid;
int cpusize;
MyList<gridseg> **combined_src;
MyList<gridseg> **combined_dst;
int *send_lengths;
int *recv_lengths;
double **send_bufs;
double **recv_bufs;
int *send_buf_caps;
int *recv_buf_caps;
MPI_Request *reqs;
MPI_Status *stats;
int max_reqs;
bool lengths_valid;
SyncCache();
void invalidate();
void destroy();
};
void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
struct AsyncSyncState {
int req_no;
bool active;
AsyncSyncState() : req_no(0), active(false) {}
};
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
SyncCache &cache, AsyncSyncState &state);
void Sync_finish(SyncCache &cache, AsyncSyncState &state,
MyList<var> *VarList, int Symmetry);
void OutBdLow2Hi(Patch *Patc, Patch *Patf, void OutBdLow2Hi(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry); int Symmetry);
@@ -140,15 +93,6 @@ namespace Parallel
void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL, void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry); int Symmetry);
void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
void Prolong(Patch *Patc, Patch *Patf, void Prolong(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry); int Symmetry);
@@ -218,18 +162,6 @@ namespace Parallel
#if (PSTR == 1 || PSTR == 2 || PSTR == 3) #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi, MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0); bool periodic, int start_rank, int end_rank, int nodes = 0);
// Redistribute blocks with time statistics for load balancing
MyList<Block> *distribute(MyList<Patch> *PatchLIST, MyList<Block> *OldBlockL,
int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0);
#endif #endif
}
// Dynamic load balancing: split blocks for heavy ranks #endif /*PARALLEL_H */
void split_heavy_blocks(MyList<Patch> *PatL, int *heavy_ranks, int num_heavy,
int split_factor, int cpusize, int ingfsi, int fngfsi);
// Check if load balancing is needed based on interpolation times
bool check_load_balance_need(double *rank_times, int nprocs, int &num_heavy, int *heavy_ranks);
}
#endif /*PARALLEL_H */

File diff suppressed because it is too large Load Diff

View File

@@ -1,8 +1,7 @@
#ifndef TWO_PUNCTURES_H #ifndef TWO_PUNCTURES_H
#define TWO_PUNCTURES_H #define TWO_PUNCTURES_H
#include <omp.h>
#define StencilSize 19 #define StencilSize 19
#define N_PlaneRelax 1 #define N_PlaneRelax 1
#define NRELAX 200 #define NRELAX 200
@@ -43,18 +42,6 @@ private:
int ntotal; int ntotal;
// ===== Precomputed spectral derivative matrices =====
double *D1_A, *D2_A;
double *D1_B, *D2_B;
double *DF1_phi, *DF2_phi;
// ===== Pre-allocated workspace for LineRelax (per-thread) =====
int max_threads;
double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be;
double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be;
double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al;
double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al;
struct parameters struct parameters
{ {
int nvar, n1, n2, n3; int nvar, n1, n2, n3;
@@ -71,28 +58,6 @@ public:
int Newtonmaxit); int Newtonmaxit);
~TwoPunctures(); ~TwoPunctures();
// 02/07: New/modified methods
void allocate_workspace();
void free_workspace();
void precompute_derivative_matrices();
void build_cheb_deriv_matrices(int n, double *D1, double *D2);
void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
double *l, double *u_ws, double *d, double *y);
void LineRelax_be_omp(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD, int tid);
void LineRelax_al_omp(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD, int tid);
void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD);
void Solve(); void Solve();
void set_initial_guess(derivs v); void set_initial_guess(derivs v);
int index(int i, int j, int k, int l, int a, int b, int c, int d); int index(int i, int j, int k, int l, int a, int b, int c, int d);
@@ -151,11 +116,23 @@ public:
double BY_KKofxyz(double x, double y, double z); double BY_KKofxyz(double x, double y, double z);
void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix); void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u); void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD);
void LineRelax_be(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD);
void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2, void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
int n3, derivs dv, derivs u, double *values); int n3, derivs dv, derivs u, double *values);
void LinEquations(double A, double B, double X, double R, void LinEquations(double A, double B, double X, double R,
double x, double r, double phi, double x, double r, double phi,
double y, double z, derivs dU, derivs U, double *values); double y, double z, derivs dU, derivs U, double *values);
void LineRelax_al(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD);
void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q); void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
void Save(char *fname); void Save(char *fname);
// provided by Vasileios Paschalidis (vpaschal@illinois.edu) // provided by Vasileios Paschalidis (vpaschal@illinois.edu)

View File

@@ -730,12 +730,6 @@ void bssn_class::Initialize()
PhysTime = StartTime; PhysTime = StartTime;
Setup_Black_Hole_position(); Setup_Black_Hole_position();
} }
// Initialize sync caches (per-level, for predictor and corrector)
sync_cache_pre = new Parallel::SyncCache[GH->levels];
sync_cache_cor = new Parallel::SyncCache[GH->levels];
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
} }
//================================================================================================ //================================================================================================
@@ -987,32 +981,6 @@ bssn_class::~bssn_class()
delete Azzz; delete Azzz;
#endif #endif
// Destroy sync caches before GH
if (sync_cache_pre)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_pre[i].destroy();
delete[] sync_cache_pre;
}
if (sync_cache_cor)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_cor[i].destroy();
delete[] sync_cache_cor;
}
if (sync_cache_rp_coarse)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_rp_coarse[i].destroy();
delete[] sync_cache_rp_coarse;
}
if (sync_cache_rp_fine)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_rp_fine[i].destroy();
delete[] sync_cache_rp_fine;
}
delete GH; delete GH;
#ifdef WithShell #ifdef WithShell
delete SH; delete SH;
@@ -2213,7 +2181,6 @@ void bssn_class::Evolve(int Steps)
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif #endif
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2)) #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2429,7 +2396,6 @@ void bssn_class::RecursiveStep(int lev)
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif #endif
} }
@@ -2608,7 +2574,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif #endif
} }
@@ -2775,7 +2740,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor); fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2790,7 +2754,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2809,7 +2772,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2825,7 +2787,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -3197,7 +3158,21 @@ void bssn_class::Step(int lev, int YN)
} }
Pp = Pp->next; Pp = Pp->next;
} }
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell #ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
@@ -3341,16 +3316,25 @@ void bssn_class::Step(int lev, int YN)
#endif #endif
} }
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
#endif #endif
Parallel::AsyncSyncState async_pre; Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -3369,23 +3353,6 @@ void bssn_class::Step(int lev, int YN)
} }
} }
#endif #endif
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
@@ -3561,7 +3528,24 @@ void bssn_class::Step(int lev, int YN)
Pp = Pp->next; Pp = Pp->next;
} }
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell #ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
@@ -3701,16 +3685,26 @@ void bssn_class::Step(int lev, int YN)
sPp = sPp->next; sPp = sPp->next;
} }
} }
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req_cor;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#"
<< iter_count << " variables at t = "
<< PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
#endif #endif
Parallel::AsyncSyncState async_cor; Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -3729,25 +3723,6 @@ void bssn_class::Step(int lev, int YN)
} }
} }
#endif #endif
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
@@ -4059,7 +4034,22 @@ void bssn_class::Step(int lev, int YN)
} }
Pp = Pp->next; Pp = Pp->next;
} }
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell #ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
@@ -4200,16 +4190,25 @@ void bssn_class::Step(int lev, int YN)
} }
#endif #endif
} }
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = "
<< PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
#endif #endif
Parallel::AsyncSyncState async_pre; Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -4223,26 +4222,8 @@ void bssn_class::Step(int lev, int YN)
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
}
}
#endif
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
} }
} }
#endif #endif
@@ -4405,7 +4386,23 @@ void bssn_class::Step(int lev, int YN)
Pp = Pp->next; Pp = Pp->next;
} }
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell #ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
@@ -4545,16 +4542,25 @@ void bssn_class::Step(int lev, int YN)
sPp = sPp->next; sPp = sPp->next;
} }
} }
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req_cor;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
#endif #endif
Parallel::AsyncSyncState async_cor; Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -4572,25 +4578,6 @@ void bssn_class::Step(int lev, int YN)
<< " seconds! " << endl; << " seconds! " << endl;
} }
} }
#endif
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif #endif
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
@@ -4956,19 +4943,11 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation");
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
} }
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR) if (ERROR)
{ {
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
@@ -4980,6 +4959,10 @@ void bssn_class::Step(int lev, int YN)
} }
} }
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
@@ -5157,21 +5140,11 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check");
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req_cor;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req_cor); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
} }
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR) if (ERROR)
{ {
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
@@ -5185,6 +5158,12 @@ void bssn_class::Step(int lev, int YN)
} }
} }
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
@@ -5468,11 +5447,21 @@ void bssn_class::SHStep()
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check");
#endif #endif
// Non-blocking error reduction overlapped with Synch to hide Allreduce latency // check error information
MPI_Request err_req;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
{ {
@@ -5490,19 +5479,6 @@ void bssn_class::SHStep()
} }
} }
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
// corrector // corrector
for (iter_count = 1; iter_count < 4; iter_count++) for (iter_count = 1; iter_count < 4; iter_count++)
{ {
@@ -5645,11 +5621,21 @@ void bssn_class::SHStep()
sPp = sPp->next; sPp = sPp->next;
} }
} }
// Non-blocking error reduction overlapped with Synch to hide Allreduce latency // check error information
MPI_Request err_req_cor;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
{ {
@@ -5667,20 +5653,6 @@ void bssn_class::SHStep()
} }
} }
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
sPp = SH->PatL; sPp = SH->PatL;
while (sPp) while (sPp)
{ {
@@ -5809,7 +5781,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// a_stream.clear(); // a_stream.clear();
@@ -5819,11 +5791,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5860,7 +5842,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// a_stream.clear(); // a_stream.clear();
@@ -5870,11 +5852,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -5888,7 +5880,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
} }
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync(GH->PatL[lev], SL, Symmetry);
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// a_stream.clear(); // a_stream.clear();
@@ -5946,14 +5938,24 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5968,21 +5970,31 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
#endif #endif
} }
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync(GH->PatL[lev], SL, Symmetry);
} }
} }
@@ -6033,14 +6045,24 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6057,21 +6079,31 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
#endif #endif
} }
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
} }
} }
@@ -6101,11 +6133,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
} }
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6114,11 +6156,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
else // no time refinement levels and for all same time levels else // no time refinement levels and for all same time levels
{ {
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6134,10 +6186,10 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
#else #else
Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
} }
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
} }
} }
#undef MIXOUTB #undef MIXOUTB

View File

@@ -126,11 +126,6 @@ public:
MyList<var> *OldStateList, *DumpList; MyList<var> *OldStateList, *DumpList;
MyList<var> *ConstraintList; MyList<var> *ConstraintList;
Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor; monitor *ConVMonitor;
surface_integral *Waveshell; surface_integral *Waveshell;

View File

@@ -61,7 +61,9 @@
real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res
real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
! gont = 0: success; gont = 1: something wrong ! gont = 0: success; gont = 1: something wrong
integer::gont integer::gont,i,j,k
real*8 :: val1, val2
real*8 :: det, t_gupxx, t_gupxy, t_gupxz, t_gupyy, t_gupyz, t_gupzz
!~~~~~~> Other variables: !~~~~~~> Other variables:
@@ -84,7 +86,10 @@
real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz
real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
real*8 :: dX, dY, dZ, PI real*8 :: PI
#if (DEBUG_NAN_CHECK)
real*8 :: dX
#endif
real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0 real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0 real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0 real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
@@ -106,8 +111,8 @@
call getpbh(BHN,Porg,Mass) call getpbh(BHN,Porg,Mass)
#endif #endif
!!! sanity check (disabled in production builds for performance) #if (DEBUG_NAN_CHECK)
#ifdef DEBUG !!! sanity check
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) & dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) & +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
+sum(Gamx)+sum(Gamy)+sum(Gamz) & +sum(Gamx)+sum(Gamy)+sum(Gamz) &
@@ -141,10 +146,6 @@
PI = dacos(-ONE) PI = dacos(-ONE)
dX = X(2) - X(1)
dY = Y(2) - Y(1)
dZ = Z(2) - Z(1)
alpn1 = Lap + ONE alpn1 = Lap + ONE
chin1 = chi + ONE chin1 = chi + ONE
gxx = dxx + ONE gxx = dxx + ONE
@@ -158,15 +159,15 @@
div_beta = betaxx + betayy + betazz div_beta = betaxx + betayy + betazz
call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev) call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev) call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev) call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev) call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + & gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + &
TWO *( gxx * betaxx + gxy * betayx + gxz * betazx) TWO *( gxx * betaxx + gxy * betayx + gxz * betazx)
@@ -192,71 +193,99 @@
gyz * betayx + gzz * betazx & gyz * betayx + gzz * betazx &
- gxz * betayy !rhs for gij - gxz * betayy !rhs for gij
! invert tilted metric ! fused loop for metric inversion and connections
gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - & !DIR$ SIMD
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz do k=1,ex(3)
gupxx = ( gyy * gzz - gyz * gyz ) / gupzz do j=1,ex(2)
gupxy = - ( gxy * gzz - gyz * gxz ) / gupzz do i=1,ex(1)
gupxz = ( gxy * gyz - gyy * gxz ) / gupzz ! 1. Metric Inversion
gupyy = ( gxx * gzz - gxz * gxz ) / gupzz det = ONE / ( &
gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz gxx(i,j,k) * gyy(i,j,k) * gzz(i,j,k) + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) + &
gupzz = ( gxx * gyy - gxy * gxy ) / gupzz gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) - gxz(i,j,k) * gyy(i,j,k) * gxz(i,j,k) - &
gxy(i,j,k) * gxy(i,j,k) * gzz(i,j,k) - gxx(i,j,k) * gyz(i,j,k) * gyz(i,j,k) )
if(co == 0)then t_gupxx = ( gyy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gyz(i,j,k) ) * det
! Gam^i_Res = Gam^i + gup^ij_,j t_gupxy = - ( gxy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gxz(i,j,k) ) * det
Gmx_Res = Gamx - (gupxx*(gupxx*gxxx+gupxy*gxyx+gupxz*gxzx)& t_gupxz = ( gxy(i,j,k) * gyz(i,j,k) - gyy(i,j,k) * gxz(i,j,k) ) * det
+gupxy*(gupxx*gxyx+gupxy*gyyx+gupxz*gyzx)& t_gupyy = ( gxx(i,j,k) * gzz(i,j,k) - gxz(i,j,k) * gxz(i,j,k) ) * det
+gupxz*(gupxx*gxzx+gupxy*gyzx+gupxz*gzzx)& t_gupyz = - ( gxx(i,j,k) * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) * det
+gupxx*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)& t_gupzz = ( gxx(i,j,k) * gyy(i,j,k) - gxy(i,j,k) * gxy(i,j,k) ) * det
+gupxy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+gupxz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+gupxx*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupxy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+gupxz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
Gmy_Res = Gamy - (gupxx*(gupxy*gxxx+gupyy*gxyx+gupyz*gxzx)&
+gupxy*(gupxy*gxyx+gupyy*gyyx+gupyz*gyzx)&
+gupxz*(gupxy*gxzx+gupyy*gyzx+gupyz*gzzx)&
+gupxy*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+gupyy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+gupyz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+gupxy*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupyy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+gupyz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
Gmz_Res = Gamz - (gupxx*(gupxz*gxxx+gupyz*gxyx+gupzz*gxzx)&
+gupxy*(gupxz*gxyx+gupyz*gyyx+gupzz*gyzx)&
+gupxz*(gupxz*gxzx+gupyz*gyzx+gupzz*gzzx)&
+gupxy*(gupxz*gxxy+gupyz*gxyy+gupzz*gxzy)&
+gupyy*(gupxz*gxyy+gupyz*gyyy+gupzz*gyzy)&
+gupyz*(gupxz*gxzy+gupyz*gyzy+gupzz*gzzy)&
+gupxz*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupyz*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+gupzz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
endif
! second kind of connection gupxx(i,j,k) = t_gupxx
Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz )) gupxy(i,j,k) = t_gupxy
Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz )) gupxz(i,j,k) = t_gupxz
Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz )) gupyy(i,j,k) = t_gupyy
gupyz(i,j,k) = t_gupyz
gupzz(i,j,k) = t_gupzz
Gamxyy =HALF*( gupxx*(TWO*gxyy - gyyx ) + gupxy*gyyy + gupxz*(TWO*gyzy - gyyz )) if(co == 0)then
Gamyyy =HALF*( gupxy*(TWO*gxyy - gyyx ) + gupyy*gyyy + gupyz*(TWO*gyzy - gyyz )) Gmx_Res(i,j,k) = Gamx(i,j,k) - (t_gupxx*(t_gupxx*gxxx(i,j,k)+t_gupxy*gxyx(i,j,k)+t_gupxz*gxzx(i,j,k))&
Gamzyy =HALF*( gupxz*(TWO*gxyy - gyyx ) + gupyz*gyyy + gupzz*(TWO*gyzy - gyyz )) +t_gupxy*(t_gupxx*gxyx(i,j,k)+t_gupxy*gyyx(i,j,k)+t_gupxz*gyzx(i,j,k))&
+t_gupxz*(t_gupxx*gxzx(i,j,k)+t_gupxy*gyzx(i,j,k)+t_gupxz*gzzx(i,j,k))&
+t_gupxx*(t_gupxy*gxxy(i,j,k)+t_gupyy*gxyy(i,j,k)+t_gupyz*gxzy(i,j,k))&
+t_gupxy*(t_gupxy*gxyy(i,j,k)+t_gupyy*gyyy(i,j,k)+t_gupyz*gyzy(i,j,k))&
+t_gupxz*(t_gupxy*gxzy(i,j,k)+t_gupyy*gyzy(i,j,k)+t_gupyz*gzzy(i,j,k))&
+t_gupxx*(t_gupxz*gxxz(i,j,k)+t_gupyz*gxyz(i,j,k)+t_gupzz*gxzz(i,j,k))&
+t_gupxy*(t_gupxz*gxyz(i,j,k)+t_gupyz*gyyz(i,j,k)+t_gupzz*gyzz(i,j,k))&
+t_gupxz*(t_gupxz*gxzz(i,j,k)+t_gupyz*gyzz(i,j,k)+t_gupzz*gzzz(i,j,k)))
Gmy_Res(i,j,k) = Gamy(i,j,k) - (t_gupxx*(t_gupxy*gxxx(i,j,k)+t_gupyy*gxyx(i,j,k)+t_gupyz*gxzx(i,j,k))&
+t_gupxy*(t_gupxy*gxyx(i,j,k)+t_gupyy*gyyx(i,j,k)+t_gupyz*gyzx(i,j,k))&
+t_gupxz*(t_gupxy*gxzx(i,j,k)+t_gupyy*gyzx(i,j,k)+t_gupyz*gzzx(i,j,k))&
+t_gupxy*(t_gupxy*gxxy(i,j,k)+t_gupyy*gxyy(i,j,k)+t_gupyz*gxzy(i,j,k))&
+t_gupyy*(t_gupxy*gxyy(i,j,k)+t_gupyy*gyyy(i,j,k)+t_gupyz*gyzy(i,j,k))&
+t_gupyz*(t_gupxy*gxzy(i,j,k)+t_gupyy*gyzy(i,j,k)+t_gupyz*gzzy(i,j,k))&
+t_gupxy*(t_gupxz*gxxz(i,j,k)+t_gupyz*gxyz(i,j,k)+t_gupzz*gxzz(i,j,k))&
+t_gupyy*(t_gupxz*gxyz(i,j,k)+t_gupyz*gyyz(i,j,k)+t_gupzz*gyzz(i,j,k))&
+t_gupyz*(t_gupxz*gxzz(i,j,k)+t_gupyz*gyzz(i,j,k)+t_gupzz*gzzz(i,j,k)))
Gmz_Res(i,j,k) = Gamz(i,j,k) - (t_gupxx*(t_gupxz*gxxx(i,j,k)+t_gupyz*gxyx(i,j,k)+t_gupzz*gxzx(i,j,k))&
+t_gupxy*(t_gupxz*gxyx(i,j,k)+t_gupyz*gyyx(i,j,k)+t_gupzz*gyzx(i,j,k))&
+t_gupxz*(t_gupxz*gxzx(i,j,k)+t_gupyz*gyzx(i,j,k)+t_gupzz*gzzx(i,j,k))&
+t_gupxy*(t_gupxz*gxxy(i,j,k)+t_gupyz*gxyy(i,j,k)+t_gupzz*gxzy(i,j,k))&
+t_gupyy*(t_gupxz*gxyy(i,j,k)+t_gupyz*gyyy(i,j,k)+t_gupzz*gyzy(i,j,k))&
+t_gupyz*(t_gupxz*gxzy(i,j,k)+t_gupyz*gyzy(i,j,k)+t_gupzz*gzzy(i,j,k))&
+t_gupxz*(t_gupxz*gxxz(i,j,k)+t_gupyz*gxyz(i,j,k)+t_gupzz*gxzz(i,j,k))&
+t_gupyz*(t_gupxz*gxyz(i,j,k)+t_gupyz*gyyz(i,j,k)+t_gupzz*gyzz(i,j,k))&
+t_gupzz*(t_gupxz*gxzz(i,j,k)+t_gupyz*gyzz(i,j,k)+t_gupzz*gzzz(i,j,k)))
endif
Gamxzz =HALF*( gupxx*(TWO*gxzz - gzzx ) + gupxy*(TWO*gyzz - gzzy ) + gupxz*gzzz) ! 2. Christoffel Symbols
Gamyzz =HALF*( gupxy*(TWO*gxzz - gzzx ) + gupyy*(TWO*gyzz - gzzy ) + gupyz*gzzz) val1 = TWO * gxyx(i,j,k) - gxxy(i,j,k)
Gamzzz =HALF*( gupxz*(TWO*gxzz - gzzx ) + gupyz*(TWO*gyzz - gzzy ) + gupzz*gzzz) val2 = TWO * gxzx(i,j,k) - gxxz(i,j,k)
Gamxxx(i,j,k) =HALF*( t_gupxx*gxxx(i,j,k) + t_gupxy*val1 + t_gupxz*val2 )
Gamyxx(i,j,k) =HALF*( t_gupxy*gxxx(i,j,k) + t_gupyy*val1 + t_gupyz*val2 )
Gamzxx(i,j,k) =HALF*( t_gupxz*gxxx(i,j,k) + t_gupyz*val1 + t_gupzz*val2 )
Gamxxy =HALF*( gupxx*gxxy + gupxy*gyyx + gupxz*( gxzy + gyzx - gxyz ) ) val1 = TWO * gxyy(i,j,k) - gyyx(i,j,k)
Gamyxy =HALF*( gupxy*gxxy + gupyy*gyyx + gupyz*( gxzy + gyzx - gxyz ) ) val2 = TWO * gyzy(i,j,k) - gyyz(i,j,k)
Gamzxy =HALF*( gupxz*gxxy + gupyz*gyyx + gupzz*( gxzy + gyzx - gxyz ) ) Gamxyy(i,j,k) =HALF*( t_gupxx*val1 + t_gupxy*gyyy(i,j,k) + t_gupxz*val2 )
Gamyyy(i,j,k) =HALF*( t_gupxy*val1 + t_gupyy*gyyy(i,j,k) + t_gupyz*val2 )
Gamzyy(i,j,k) =HALF*( t_gupxz*val1 + t_gupyz*gyyy(i,j,k) + t_gupzz*val2 )
val1 = TWO * gxzz(i,j,k) - gzzx(i,j,k)
val2 = TWO * gyzz(i,j,k) - gzzy(i,j,k)
Gamxzz(i,j,k) =HALF*( t_gupxx*val1 + t_gupxy*val2 + t_gupxz*gzzz(i,j,k) )
Gamyzz(i,j,k) =HALF*( t_gupxy*val1 + t_gupyy*val2 + t_gupyz*gzzz(i,j,k) )
Gamzzz(i,j,k) =HALF*( t_gupxz*val1 + t_gupyz*val2 + t_gupzz*gzzz(i,j,k) )
val1 = gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)
Gamxxy(i,j,k) =HALF*( t_gupxx*gxxy(i,j,k) + t_gupxy*gyyx(i,j,k) + t_gupxz*val1 )
Gamyxy(i,j,k) =HALF*( t_gupxy*gxxy(i,j,k) + t_gupyy*gyyx(i,j,k) + t_gupyz*val1 )
Gamzxy(i,j,k) =HALF*( t_gupxz*gxxy(i,j,k) + t_gupyz*gyyx(i,j,k) + t_gupzz*val1 )
val1 = gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)
Gamxxz(i,j,k) =HALF*( t_gupxx*gxxz(i,j,k) + t_gupxy*val1 + t_gupxz*gzzx(i,j,k) )
Gamyxz(i,j,k) =HALF*( t_gupxy*gxxz(i,j,k) + t_gupyy*val1 + t_gupyz*gzzx(i,j,k) )
Gamzxz(i,j,k) =HALF*( t_gupxz*gxxz(i,j,k) + t_gupyz*val1 + t_gupzz*gzzx(i,j,k) )
val1 = gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)
Gamxyz(i,j,k) =HALF*( t_gupxx*val1 + t_gupxy*gyyz(i,j,k) + t_gupxz*gzzy(i,j,k) )
Gamyyz(i,j,k) =HALF*( t_gupxy*val1 + t_gupyy*gyyz(i,j,k) + t_gupyz*gzzy(i,j,k) )
Gamzyz(i,j,k) =HALF*( t_gupxz*val1 + t_gupyz*gyyz(i,j,k) + t_gupzz*gzzy(i,j,k) )
enddo
enddo
enddo
Gamxxz =HALF*( gupxx*gxxz + gupxy*( gxyz + gyzx - gxzy ) + gupxz*gzzx )
Gamyxz =HALF*( gupxy*gxxz + gupyy*( gxyz + gyzx - gxzy ) + gupyz*gzzx )
Gamzxz =HALF*( gupxz*gxxz + gupyz*( gxyz + gyzx - gxzy ) + gupzz*gzzx )
Gamxyz =HALF*( gupxx*( gxyz + gxzy - gyzx ) + gupxy*gyyz + gupxz*gzzy )
Gamyyz =HALF*( gupxy*( gxyz + gxzy - gyzx ) + gupyy*gyyz + gupyz*gzzy )
Gamzyz =HALF*( gupxz*( gxyz + gxzy - gyzx ) + gupyz*gyyz + gupzz*gzzy )
! Raise indices of \tilde A_{ij} and store in R_ij ! Raise indices of \tilde A_{ij} and store in R_ij
Rxx = gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + & Rxx = gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + &
@@ -287,30 +316,40 @@
call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev) call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
! reuse fxx/fxy/fxz as temporaries for matter-source combinations
fxx = F2o3 * Kx + EIGHT * PI * Sx
fxy = F2o3 * Ky + EIGHT * PI * Sy
fxz = F2o3 * Kz + EIGHT * PI * Sz
! reuse Gamxa/Gamya/Gamza as temporaries for chix*R combinations
Gamxa = chix * Rxx + chiy * Rxy + chiz * Rxz
Gamya = chix * Rxy + chiy * Ryy + chiz * Ryz
Gamza = chix * Rxz + chiy * Ryz + chiz * Rzz
Gamx_rhs = - TWO * ( Lapx * Rxx + Lapy * Rxy + Lapz * Rxz ) + & Gamx_rhs = - TWO * ( Lapx * Rxx + Lapy * Rxy + Lapz * Rxz ) + &
TWO * alpn1 * ( & TWO * alpn1 * ( &
-F3o2/chin1 * ( chix * Rxx + chiy * Rxy + chiz * Rxz ) - & -F3o2 * ONE/chin1 * Gamxa - &
gupxx * ( F2o3 * Kx + EIGHT * PI * Sx ) - & gupxx * fxx - &
gupxy * ( F2o3 * Ky + EIGHT * PI * Sy ) - & gupxy * fxy - &
gupxz * ( F2o3 * Kz + EIGHT * PI * Sz ) + & gupxz * fxz + &
Gamxxx * Rxx + Gamxyy * Ryy + Gamxzz * Rzz + & Gamxxx * Rxx + Gamxyy * Ryy + Gamxzz * Rzz + &
TWO * ( Gamxxy * Rxy + Gamxxz * Rxz + Gamxyz * Ryz ) ) TWO * ( Gamxxy * Rxy + Gamxxz * Rxz + Gamxyz * Ryz ) )
Gamy_rhs = - TWO * ( Lapx * Rxy + Lapy * Ryy + Lapz * Ryz ) + & Gamy_rhs = - TWO * ( Lapx * Rxy + Lapy * Ryy + Lapz * Ryz ) + &
TWO * alpn1 * ( & TWO * alpn1 * ( &
-F3o2/chin1 * ( chix * Rxy + chiy * Ryy + chiz * Ryz ) - & -F3o2 * ONE/chin1 * Gamya - &
gupxy * ( F2o3 * Kx + EIGHT * PI * Sx ) - & gupxy * fxx - &
gupyy * ( F2o3 * Ky + EIGHT * PI * Sy ) - & gupyy * fxy - &
gupyz * ( F2o3 * Kz + EIGHT * PI * Sz ) + & gupyz * fxz + &
Gamyxx * Rxx + Gamyyy * Ryy + Gamyzz * Rzz + & Gamyxx * Rxx + Gamyyy * Ryy + Gamyzz * Rzz + &
TWO * ( Gamyxy * Rxy + Gamyxz * Rxz + Gamyyz * Ryz ) ) TWO * ( Gamyxy * Rxy + Gamyxz * Rxz + Gamyyz * Ryz ) )
Gamz_rhs = - TWO * ( Lapx * Rxz + Lapy * Ryz + Lapz * Rzz ) + & Gamz_rhs = - TWO * ( Lapx * Rxz + Lapy * Ryz + Lapz * Rzz ) + &
TWO * alpn1 * ( & TWO * alpn1 * ( &
-F3o2/chin1 * ( chix * Rxz + chiy * Ryz + chiz * Rzz ) - & -F3o2 * ONE/chin1 * Gamza - &
gupxz * ( F2o3 * Kx + EIGHT * PI * Sx ) - & gupxz * fxx - &
gupyz * ( F2o3 * Ky + EIGHT * PI * Sy ) - & gupyz * fxy - &
gupzz * ( F2o3 * Kz + EIGHT * PI * Sz ) + & gupzz * fxz + &
Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz + & Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz + &
TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) ) TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) )
@@ -612,47 +651,47 @@
fzz = fzz - Gamxzz * chix - Gamyzz * chiy - Gamzzz * chiz fzz = fzz - Gamxzz * chix - Gamyzz * chiy - Gamzzz * chiz
! Store D^l D_l chi - 3/(2*chi) D^l chi D_l chi in f ! Store D^l D_l chi - 3/(2*chi) D^l chi D_l chi in f
f = gupxx * ( fxx - F3o2/chin1 * chix * chix ) + & f = gupxx * ( fxx - F3o2 * ONE/chin1 * chix * chix ) + &
gupyy * ( fyy - F3o2/chin1 * chiy * chiy ) + & gupyy * ( fyy - F3o2 * ONE/chin1 * chiy * chiy ) + &
gupzz * ( fzz - F3o2/chin1 * chiz * chiz ) + & gupzz * ( fzz - F3o2 * ONE/chin1 * chiz * chiz ) + &
TWO * gupxy * ( fxy - F3o2/chin1 * chix * chiy ) + & TWO * gupxy * ( fxy - F3o2 * ONE/chin1 * chix * chiy ) + &
TWO * gupxz * ( fxz - F3o2/chin1 * chix * chiz ) + & TWO * gupxz * ( fxz - F3o2 * ONE/chin1 * chix * chiz ) + &
TWO * gupyz * ( fyz - F3o2/chin1 * chiy * chiz ) TWO * gupyz * ( fyz - F3o2 * ONE/chin1 * chiy * chiz )
! Add chi part to Ricci tensor: ! Add chi part to Ricci tensor:
Rxx = Rxx + (fxx - chix*chix/chin1/TWO + gxx * f)/chin1/TWO Rxx = Rxx + (fxx - chix*chix*ONE/chin1*HALF + gxx * f) * ONE/chin1 * HALF
Ryy = Ryy + (fyy - chiy*chiy/chin1/TWO + gyy * f)/chin1/TWO Ryy = Ryy + (fyy - chiy*chiy*ONE/chin1*HALF + gyy * f) * ONE/chin1 * HALF
Rzz = Rzz + (fzz - chiz*chiz/chin1/TWO + gzz * f)/chin1/TWO Rzz = Rzz + (fzz - chiz*chiz*ONE/chin1*HALF + gzz * f) * ONE/chin1 * HALF
Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO Rxy = Rxy + (fxy - chix*chiy*ONE/chin1*HALF + gxy * f) * ONE/chin1 * HALF
Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO Rxz = Rxz + (fxz - chix*chiz*ONE/chin1*HALF + gxz * f) * ONE/chin1 * HALF
Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO Ryz = Ryz + (fyz - chiy*chiz*ONE/chin1*HALF + gyz * f) * ONE/chin1 * HALF
! covariant second derivatives of the lapse respect to physical metric ! covariant second derivatives of the lapse respect to physical metric
call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, & call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
SYM,SYM,SYM,symmetry,Lev) SYM,SYM,SYM,symmetry,Lev)
gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1 gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz) * ONE/chin1
gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1 gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz) * ONE/chin1
gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1 gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz) * ONE/chin1
! now get physical second kind of connection ! now get physical second kind of connection
Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF Gamxxx = Gamxxx - ( TWO * chix * ONE/chin1 - gxx * gxxx )*HALF
Gamyxx = Gamyxx - ( - gxx * gxxy )*HALF Gamyxx = Gamyxx - ( - gxx * gxxy )*HALF
Gamzxx = Gamzxx - ( - gxx * gxxz )*HALF Gamzxx = Gamzxx - ( - gxx * gxxz )*HALF
Gamxyy = Gamxyy - ( - gyy * gxxx )*HALF Gamxyy = Gamxyy - ( - gyy * gxxx )*HALF
Gamyyy = Gamyyy - ( (chiy + chiy)/chin1 - gyy * gxxy )*HALF Gamyyy = Gamyyy - ( TWO * chiy * ONE/chin1 - gyy * gxxy )*HALF
Gamzyy = Gamzyy - ( - gyy * gxxz )*HALF Gamzyy = Gamzyy - ( - gyy * gxxz )*HALF
Gamxzz = Gamxzz - ( - gzz * gxxx )*HALF Gamxzz = Gamxzz - ( - gzz * gxxx )*HALF
Gamyzz = Gamyzz - ( - gzz * gxxy )*HALF Gamyzz = Gamyzz - ( - gzz * gxxy )*HALF
Gamzzz = Gamzzz - ( (chiz + chiz)/chin1 - gzz * gxxz )*HALF Gamzzz = Gamzzz - ( TWO * chiz * ONE/chin1 - gzz * gxxz )*HALF
Gamxxy = Gamxxy - ( chiy /chin1 - gxy * gxxx )*HALF Gamxxy = Gamxxy - ( chiy * ONE/chin1 - gxy * gxxx )*HALF
Gamyxy = Gamyxy - ( chix /chin1 - gxy * gxxy )*HALF Gamyxy = Gamyxy - ( chix * ONE/chin1 - gxy * gxxy )*HALF
Gamzxy = Gamzxy - ( - gxy * gxxz )*HALF Gamzxy = Gamzxy - ( - gxy * gxxz )*HALF
Gamxxz = Gamxxz - ( chiz /chin1 - gxz * gxxx )*HALF Gamxxz = Gamxxz - ( chiz * ONE/chin1 - gxz * gxxx )*HALF
Gamyxz = Gamyxz - ( - gxz * gxxy )*HALF Gamyxz = Gamyxz - ( - gxz * gxxy )*HALF
Gamzxz = Gamzxz - ( chix /chin1 - gxz * gxxz )*HALF Gamzxz = Gamzxz - ( chix * ONE/chin1 - gxz * gxxz )*HALF
Gamxyz = Gamxyz - ( - gyz * gxxx )*HALF Gamxyz = Gamxyz - ( - gyz * gxxx )*HALF
Gamyyz = Gamyyz - ( chiz /chin1 - gyz * gxxy )*HALF Gamyyz = Gamyyz - ( chiz * ONE/chin1 - gyz * gxxy )*HALF
Gamzyz = Gamzyz - ( chiy /chin1 - gyz * gxxz )*HALF Gamzyz = Gamzyz - ( chiy * ONE/chin1 - gyz * gxxz )*HALF
fxx = fxx - Gamxxx*Lapx - Gamyxx*Lapy - Gamzxx*Lapz fxx = fxx - Gamxxx*Lapx - Gamyxx*Lapy - Gamzxx*Lapz
fyy = fyy - Gamxyy*Lapx - Gamyyy*Lapy - Gamzyy*Lapz fyy = fyy - Gamxyy*Lapx - Gamyyy*Lapy - Gamzyy*Lapz
@@ -695,7 +734,7 @@
gupxz * (Axy * Azz + Ayz * Axz) + & gupxz * (Axy * Azz + Ayz * Axz) + &
gupyz * (Ayy * Azz + Ayz * Ayz) ) )) -1.6d1*PI*rho + EIGHT * PI * S gupyz * (Ayy * Azz + Ayz * Ayz) ) )) -1.6d1*PI*rho + EIGHT * PI * S
f = - F1o3 *( gupxx * fxx + gupyy * fyy + gupzz * fzz + & f = - F1o3 *( gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1/chin1*f) TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1 * ONE/chin1 * f)
fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
@@ -815,7 +854,8 @@
call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + & reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs) TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
reta = 1.31d0/2*dsqrt(reta/chin1)/(1-dsqrt(chin1))**2 fxx = dsqrt(chin1)
reta = 1.31d0/2*dsqrt(reta*ONE/chin1)/(ONE-fxx)**2
dtSfx_rhs = Gamx_rhs - reta*dtSfx dtSfx_rhs = Gamx_rhs - reta*dtSfx
dtSfy_rhs = Gamy_rhs - reta*dtSfy dtSfy_rhs = Gamy_rhs - reta*dtSfy
dtSfz_rhs = Gamz_rhs - reta*dtSfz dtSfz_rhs = Gamz_rhs - reta*dtSfz
@@ -827,7 +867,7 @@
call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + & reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs) TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
reta = 1.31d0/2*dsqrt(reta/chin1)/(1-chin1)**2 reta = 1.31d0/2*dsqrt(reta*ONE/chin1)/(ONE-chin1)**2
dtSfx_rhs = Gamx_rhs - reta*dtSfx dtSfx_rhs = Gamx_rhs - reta*dtSfx
dtSfy_rhs = Gamy_rhs - reta*dtSfy dtSfy_rhs = Gamy_rhs - reta*dtSfy
dtSfz_rhs = Gamz_rhs - reta*dtSfz dtSfz_rhs = Gamz_rhs - reta*dtSfz
@@ -835,7 +875,8 @@
call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + & reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs) TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
reta = 1.31d0/2*dsqrt(reta/chin1)/(1-dsqrt(chin1))**2 fxx = dsqrt(chin1)
reta = 1.31d0/2*dsqrt(reta*ONE/chin1)/(ONE-fxx)**2
betax_rhs = FF*Gamx - reta*betax betax_rhs = FF*Gamx - reta*betax
betay_rhs = FF*Gamy - reta*betay betay_rhs = FF*Gamy - reta*betay
betaz_rhs = FF*Gamz - reta*betaz betaz_rhs = FF*Gamz - reta*betaz
@@ -847,7 +888,7 @@
call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + & reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs) TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
reta = 1.31d0/2*dsqrt(reta/chin1)/(1-chin1)**2 reta = 1.31d0/2*dsqrt(reta*ONE/chin1)/(ONE-chin1)**2
betax_rhs = FF*Gamx - reta*betax betax_rhs = FF*Gamx - reta*betax
betay_rhs = FF*Gamy - reta*betay betay_rhs = FF*Gamy - reta*betay
betaz_rhs = FF*Gamz - reta*betaz betaz_rhs = FF*Gamz - reta*betaz
@@ -945,60 +986,103 @@
SSA(2)=SYM SSA(2)=SYM
SSA(3)=ANTI SSA(3)=ANTI
!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency) !!!!!!!!!advection term part
! lopsided_kodis shares the symmetry_bd buffer between advection and
! dissipation, eliminating redundant full-grid copies. For metric variables
! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
! so the constant offset has no effect on dissipation.
call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS)
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps) call lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA)
call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps) call lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA)
call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS)
call lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps) call lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA)
call lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps) call lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA)
call lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps) call lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps) call lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps) call lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA)
!!
#if 1
!! bam does not apply dissipation on gauge variables
call lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps)
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#endif
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#endif
#else
! No dissipation on gauge variables (advection only)
call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS) call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS)
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7) #if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS) call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS) call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA) call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA)
#endif #endif
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7) #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS) call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS) call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA) call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA)
#endif #endif
if(eps>0)then
! usual Kreiss-Oliger dissipation
call kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps)
call kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps)
call kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps)
call kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps)
#if 0
#define i 42
#define j 40
#define k 40
if(Lev == 1)then
write(*,*) X(i),Y(j),Z(k)
write(*,*) "before",Axx_rhs(i,j,k)
endif
#undef i
#undef j
#undef k
!!stop
#endif #endif
call kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps)
#if 0
#define i 42
#define j 40
#define k 40
if(Lev == 1)then
write(*,*) X(i),Y(j),Z(k)
write(*,*) "after",Axx_rhs(i,j,k)
endif
#undef i
#undef j
#undef k
!!stop
#endif
call kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps)
call kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps)
call kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps)
call kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps)
#if 1
!! bam does not apply dissipation on gauge variables
call kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps)
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps)
#endif
#endif
endif
if(co == 0)then if(co == 0)then
! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho ! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho
@@ -1036,48 +1120,48 @@
! mov_Res_j = gupkj*(-1/chi d_k chi*A_ij + D_k A_ij) - 2/3 d_j trK - 8 PI s_j where D respect to physical metric ! mov_Res_j = gupkj*(-1/chi d_k chi*A_ij + D_k A_ij) - 2/3 d_j trK - 8 PI s_j where D respect to physical metric
! store D_i A_jk - 1/chi d_i chi*A_jk in gjk_i ! store D_i A_jk - 1/chi d_i chi*A_jk in gjk_i
call fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0) call fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
call fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
call fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
call fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0) call fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0)
call fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0) call fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0)
call fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
call fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0) call fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0)
call fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
gxxx = gxxx - ( Gamxxx * Axx + Gamyxx * Axy + Gamzxx * Axz & gxxx = gxxx - ( Gamxxx * Axx + Gamyxx * Axy + Gamzxx * Axz &
+ Gamxxx * Axx + Gamyxx * Axy + Gamzxx * Axz) - chix*Axx/chin1 + Gamxxx * Axx + Gamyxx * Axy + Gamzxx * Axz) - chix*Axx*ONE/chin1
gxyx = gxyx - ( Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz & gxyx = gxyx - ( Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz &
+ Gamxxx * Axy + Gamyxx * Ayy + Gamzxx * Ayz) - chix*Axy/chin1 + Gamxxx * Axy + Gamyxx * Ayy + Gamzxx * Ayz) - chix*Axy*ONE/chin1
gxzx = gxzx - ( Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz & gxzx = gxzx - ( Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz &
+ Gamxxx * Axz + Gamyxx * Ayz + Gamzxx * Azz) - chix*Axz/chin1 + Gamxxx * Axz + Gamyxx * Ayz + Gamzxx * Azz) - chix*Axz*ONE/chin1
gyyx = gyyx - ( Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz & gyyx = gyyx - ( Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz &
+ Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz) - chix*Ayy/chin1 + Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz) - chix*Ayy*ONE/chin1
gyzx = gyzx - ( Gamxxz * Axy + Gamyxz * Ayy + Gamzxz * Ayz & gyzx = gyzx - ( Gamxxz * Axy + Gamyxz * Ayy + Gamzxz * Ayz &
+ Gamxxy * Axz + Gamyxy * Ayz + Gamzxy * Azz) - chix*Ayz/chin1 + Gamxxy * Axz + Gamyxy * Ayz + Gamzxy * Azz) - chix*Ayz*ONE/chin1
gzzx = gzzx - ( Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz & gzzx = gzzx - ( Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz &
+ Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz) - chix*Azz/chin1 + Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz) - chix*Azz*ONE/chin1
gxxy = gxxy - ( Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz & gxxy = gxxy - ( Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz &
+ Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz) - chiy*Axx/chin1 + Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz) - chiy*Axx*ONE/chin1
gxyy = gxyy - ( Gamxyy * Axx + Gamyyy * Axy + Gamzyy * Axz & gxyy = gxyy - ( Gamxyy * Axx + Gamyyy * Axy + Gamzyy * Axz &
+ Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz) - chiy*Axy/chin1 + Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz) - chiy*Axy*ONE/chin1
gxzy = gxzy - ( Gamxyz * Axx + Gamyyz * Axy + Gamzyz * Axz & gxzy = gxzy - ( Gamxyz * Axx + Gamyyz * Axy + Gamzyz * Axz &
+ Gamxxy * Axz + Gamyxy * Ayz + Gamzxy * Azz) - chiy*Axz/chin1 + Gamxxy * Axz + Gamyxy * Ayz + Gamzxy * Azz) - chiy*Axz*ONE/chin1
gyyy = gyyy - ( Gamxyy * Axy + Gamyyy * Ayy + Gamzyy * Ayz & gyyy = gyyy - ( Gamxyy * Axy + Gamyyy * Ayy + Gamzyy * Ayz &
+ Gamxyy * Axy + Gamyyy * Ayy + Gamzyy * Ayz) - chiy*Ayy/chin1 + Gamxyy * Axy + Gamyyy * Ayy + Gamzyy * Ayz) - chiy*Ayy*ONE/chin1
gyzy = gyzy - ( Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz & gyzy = gyzy - ( Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz &
+ Gamxyy * Axz + Gamyyy * Ayz + Gamzyy * Azz) - chiy*Ayz/chin1 + Gamxyy * Axz + Gamyyy * Ayz + Gamzyy * Azz) - chiy*Ayz*ONE/chin1
gzzy = gzzy - ( Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz & gzzy = gzzy - ( Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz &
+ Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz) - chiy*Azz/chin1 + Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz) - chiy*Azz*ONE/chin1
gxxz = gxxz - ( Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz & gxxz = gxxz - ( Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz &
+ Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz) - chiz*Axx/chin1 + Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz) - chiz*Axx*ONE/chin1
gxyz = gxyz - ( Gamxyz * Axx + Gamyyz * Axy + Gamzyz * Axz & gxyz = gxyz - ( Gamxyz * Axx + Gamyyz * Axy + Gamzyz * Axz &
+ Gamxxz * Axy + Gamyxz * Ayy + Gamzxz * Ayz) - chiz*Axy/chin1 + Gamxxz * Axy + Gamyxz * Ayy + Gamzxz * Ayz) - chiz*Axy*ONE/chin1
gxzz = gxzz - ( Gamxzz * Axx + Gamyzz * Axy + Gamzzz * Axz & gxzz = gxzz - ( Gamxzz * Axx + Gamyzz * Axy + Gamzzz * Axz &
+ Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz) - chiz*Axz/chin1 + Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz) - chiz*Axz*ONE/chin1
gyyz = gyyz - ( Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz & gyyz = gyyz - ( Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz &
+ Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz) - chiz*Ayy/chin1 + Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz) - chiz*Ayy*ONE/chin1
gyzz = gyzz - ( Gamxzz * Axy + Gamyzz * Ayy + Gamzzz * Ayz & gyzz = gyzz - ( Gamxzz * Axy + Gamyzz * Ayy + Gamzzz * Ayz &
+ Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz) - chiz*Ayz/chin1 + Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz) - chiz*Ayz*ONE/chin1
gzzz = gzzz - ( Gamxzz * Axz + Gamyzz * Ayz + Gamzzz * Azz & gzzz = gzzz - ( Gamxzz * Axz + Gamyzz * Ayz + Gamzzz * Azz &
+ Gamxzz * Axz + Gamyzz * Ayz + Gamzzz * Azz) - chiz*Azz/chin1 + Gamxzz * Axz + Gamyzz * Ayz + Gamzzz * Azz) - chiz*Azz*ONE/chin1
movx_Res = gupxx*gxxx + gupyy*gxyy + gupzz*gxzz & movx_Res = gupxx*gxxx + gupyy*gxyy + gupzz*gxzz &
+gupxy*gxyx + gupxz*gxzx + gupyz*gxzy & +gupxy*gxyx + gupxz*gxzx + gupyz*gxzy &
+gupxy*gxxy + gupxz*gxxz + gupyz*gxyz +gupxy*gxxy + gupxz*gxxz + gupyz*gxyz

View File

@@ -43,14 +43,6 @@ cgh::cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun,
end_rank = 0; end_rank = 0;
#endif #endif
// Initialize load balancing variables
enable_load_balance = false;
load_balance_check_interval = 10; // Check every 10 time steps
current_time_step = 0;
rank_interp_times = nullptr;
heavy_ranks = nullptr;
num_heavy_ranks = 0;
if (!checkrun) if (!checkrun)
{ {
read_bbox(Symmetry, filename); read_bbox(Symmetry, filename);
@@ -121,12 +113,6 @@ cgh::~cgh()
delete[] Porgls[lev]; delete[] Porgls[lev];
} }
delete[] Porgls; delete[] Porgls;
// Clean up load balancing memory
if (rank_interp_times)
delete[] rank_interp_times;
if (heavy_ranks)
delete[] heavy_ranks;
} }
//================================================================================================ //================================================================================================
@@ -144,7 +130,7 @@ void cgh::compose_cgh(int nprocs)
for (int lev = 0; lev < levels; lev++) for (int lev = 0; lev < levels; lev++)
{ {
checkPatchList(PatL[lev], false); checkPatchList(PatL[lev], false);
Parallel::distribute_hard(PatL[lev], nprocs, ingfs, fngfs, false); Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
#if (RPB == 1) #if (RPB == 1)
// we need distributed box of PatL[lev] and PatL[lev-1] // we need distributed box of PatL[lev] and PatL[lev-1]
if (lev > 0) if (lev > 0)
@@ -1719,121 +1705,3 @@ void cgh::settrfls(const int lev)
{ {
trfls = lev; trfls = lev;
} }
//================================================================================================
// Load Balancing Functions
//================================================================================================
// Initialize load balancing
void cgh::init_load_balance(int nprocs)
{
if (rank_interp_times)
delete[] rank_interp_times;
if (heavy_ranks)
delete[] heavy_ranks;
rank_interp_times = new double[nprocs];
heavy_ranks = new int[4]; // Maximum 4 heavy ranks
num_heavy_ranks = 0;
for (int i = 0; i < nprocs; i++)
rank_interp_times[i] = 0.0;
}
// Update interpolation time for a rank
void cgh::update_interp_time(int rank, double time)
{
if (rank_interp_times && rank >= 0)
{
rank_interp_times[rank] = time;
}
}
// Check and perform load balancing if needed
bool cgh::check_and_rebalance(int nprocs, int lev,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
// Only check at specified intervals
current_time_step++;
if (current_time_step % load_balance_check_interval != 0)
return false;
if (myrank == 0)
{
cout << "\n=== Checking load balance at time step " << current_time_step << " ===" << endl;
}
// Collect all rank times on rank 0
double *all_times = nullptr;
if (myrank == 0)
{
all_times = new double[nprocs];
}
MPI_Gather(rank_interp_times, 1, MPI_DOUBLE, all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
bool need_rebalance = false;
if (myrank == 0)
{
// Check if load balancing is needed
need_rebalance = Parallel::check_load_balance_need(all_times, nprocs, num_heavy_ranks, heavy_ranks);
if (need_rebalance)
{
cout << "=== Load imbalance detected! Need to rebalance ===" << endl;
cout << "Top " << num_heavy_ranks << " heavy ranks: ";
for (int i = 0; i < num_heavy_ranks; i++)
{
cout << heavy_ranks[i] << " (" << all_times[heavy_ranks[i]] << " s) ";
}
cout << endl;
// Analyze blocks that need to be split
Parallel::split_heavy_blocks(PatL[lev], heavy_ranks, num_heavy_ranks, 2, nprocs, ingfs, fngfs);
// Set lev_flag to trigger recompose_cgh
cout << "=== Triggering recompose_cgh for level " << lev << " ===" << endl;
}
else
{
cout << "=== Load is balanced, no rebalancing needed ===" << endl;
}
delete[] all_times;
}
// Broadcast the decision to all ranks
MPI_Bcast(&need_rebalance, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
if (need_rebalance)
{
// Broadcast heavy ranks information
MPI_Bcast(&num_heavy_ranks, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(heavy_ranks, num_heavy_ranks, MPI_INT, 0, MPI_COMM_WORLD);
// Perform recompose_cgh on the specified level
if (myrank == 0)
{
cout << "=== Performing recompose_cgh ===" << endl;
}
// Call recompose_cgh_Onelevel for the specified level
bool *lev_flag = new bool[1];
lev_flag[0] = true;
recompose_cgh_Onelevel(nprocs, lev, OldList, StateList, FutureList, tmList, Symmetry, BB);
delete[] lev_flag;
// Reset time counter after rebalancing
current_time_step = 0;
return true;
}
return false;
}

View File

@@ -87,21 +87,6 @@ public:
#if (PSTR == 1 || PSTR == 2 || PSTR == 3) #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
void construct_mylev(int nprocs); void construct_mylev(int nprocs);
#endif #endif
// Load balancing support
bool enable_load_balance; // Enable load balancing
int load_balance_check_interval; // Check interval (in time steps)
int current_time_step; // Current time step counter
double *rank_interp_times; // Store interpolation times for each rank
int *heavy_ranks; // Store heavy rank numbers
int num_heavy_ranks; // Number of heavy ranks
void init_load_balance(int nprocs);
void update_interp_time(int rank, double time);
bool check_and_rebalance(int nprocs, int lev,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB);
}; };
#endif /* CGH_H */ #endif /* CGH_H */

View File

@@ -69,8 +69,6 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -373,8 +371,6 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1943,6 +1939,309 @@
return return
end subroutine fddyz end subroutine fddyz
subroutine fderivs_batch4(ex,f1,f2,f3,f4, &
f1x,f1y,f1z,f2x,f2y,f2z,f3x,f3y,f3z,f4x,f4y,f4z, &
X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
implicit none
integer, intent(in ):: ex(1:3),symmetry,onoff
real*8, dimension(ex(1),ex(2),ex(3)), intent(in ):: f1,f2,f3,f4
real*8, dimension(ex(1),ex(2),ex(3)), intent(out):: f1x,f1y,f1z
real*8, dimension(ex(1),ex(2),ex(3)), intent(out):: f2x,f2y,f2z
real*8, dimension(ex(1),ex(2),ex(3)), intent(out):: f3x,f3y,f3z
real*8, dimension(ex(1),ex(2),ex(3)), intent(out):: f4x,f4y,f4z
real*8, intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
real*8, intent(in ):: SYM1,SYM2,SYM3
!~~~~~~ other variables
real*8 :: dX,dY,dZ
real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh1,fh2,fh3,fh4
real*8, dimension(3) :: SoA
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
real*8, parameter :: ZEO=0.d0,ONE=1.d0
real*8, parameter :: TWO=2.d0,EIT=8.d0
real*8, parameter :: F12=1.2d1
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
SoA(1) = SYM1
SoA(2) = SYM2
SoA(3) = SYM3
call symmetry_bd(2,ex,f1,fh1,SoA)
call symmetry_bd(2,ex,f2,fh2,SoA)
call symmetry_bd(2,ex,f3,fh3,SoA)
call symmetry_bd(2,ex,f4,fh4,SoA)
d12dx = ONE/F12/dX
d12dy = ONE/F12/dY
d12dz = ONE/F12/dZ
d2dx = ONE/TWO/dX
d2dy = ONE/TWO/dY
d2dz = ONE/TWO/dZ
f1x = ZEO; f1y = ZEO; f1z = ZEO
f2x = ZEO; f2y = ZEO; f2z = ZEO
f3x = ZEO; f3y = ZEO; f3z = ZEO
f4x = ZEO; f4y = ZEO; f4z = ZEO
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
if(i+2 <= imax .and. i-2 >= imin .and. &
j+2 <= jmax .and. j-2 >= jmin .and. &
k+2 <= kmax .and. k-2 >= kmin) then
f1x(i,j,k)=d12dx*(fh1(i-2,j,k)-EIT*fh1(i-1,j,k)+EIT*fh1(i+1,j,k)-fh1(i+2,j,k))
f1y(i,j,k)=d12dy*(fh1(i,j-2,k)-EIT*fh1(i,j-1,k)+EIT*fh1(i,j+1,k)-fh1(i,j+2,k))
f1z(i,j,k)=d12dz*(fh1(i,j,k-2)-EIT*fh1(i,j,k-1)+EIT*fh1(i,j,k+1)-fh1(i,j,k+2))
f2x(i,j,k)=d12dx*(fh2(i-2,j,k)-EIT*fh2(i-1,j,k)+EIT*fh2(i+1,j,k)-fh2(i+2,j,k))
f2y(i,j,k)=d12dy*(fh2(i,j-2,k)-EIT*fh2(i,j-1,k)+EIT*fh2(i,j+1,k)-fh2(i,j+2,k))
f2z(i,j,k)=d12dz*(fh2(i,j,k-2)-EIT*fh2(i,j,k-1)+EIT*fh2(i,j,k+1)-fh2(i,j,k+2))
f3x(i,j,k)=d12dx*(fh3(i-2,j,k)-EIT*fh3(i-1,j,k)+EIT*fh3(i+1,j,k)-fh3(i+2,j,k))
f3y(i,j,k)=d12dy*(fh3(i,j-2,k)-EIT*fh3(i,j-1,k)+EIT*fh3(i,j+1,k)-fh3(i,j+2,k))
f3z(i,j,k)=d12dz*(fh3(i,j,k-2)-EIT*fh3(i,j,k-1)+EIT*fh3(i,j,k+1)-fh3(i,j,k+2))
f4x(i,j,k)=d12dx*(fh4(i-2,j,k)-EIT*fh4(i-1,j,k)+EIT*fh4(i+1,j,k)-fh4(i+2,j,k))
f4y(i,j,k)=d12dy*(fh4(i,j-2,k)-EIT*fh4(i,j-1,k)+EIT*fh4(i,j+1,k)-fh4(i,j+2,k))
f4z(i,j,k)=d12dz*(fh4(i,j,k-2)-EIT*fh4(i,j,k-1)+EIT*fh4(i,j,k+1)-fh4(i,j,k+2))
elseif(i+1 <= imax .and. i-1 >= imin .and. &
j+1 <= jmax .and. j-1 >= jmin .and. &
k+1 <= kmax .and. k-1 >= kmin) then
f1x(i,j,k)=d2dx*(-fh1(i-1,j,k)+fh1(i+1,j,k))
f1y(i,j,k)=d2dy*(-fh1(i,j-1,k)+fh1(i,j+1,k))
f1z(i,j,k)=d2dz*(-fh1(i,j,k-1)+fh1(i,j,k+1))
f2x(i,j,k)=d2dx*(-fh2(i-1,j,k)+fh2(i+1,j,k))
f2y(i,j,k)=d2dy*(-fh2(i,j-1,k)+fh2(i,j+1,k))
f2z(i,j,k)=d2dz*(-fh2(i,j,k-1)+fh2(i,j,k+1))
f3x(i,j,k)=d2dx*(-fh3(i-1,j,k)+fh3(i+1,j,k))
f3y(i,j,k)=d2dy*(-fh3(i,j-1,k)+fh3(i,j+1,k))
f3z(i,j,k)=d2dz*(-fh3(i,j,k-1)+fh3(i,j,k+1))
f4x(i,j,k)=d2dx*(-fh4(i-1,j,k)+fh4(i+1,j,k))
f4y(i,j,k)=d2dy*(-fh4(i,j-1,k)+fh4(i,j+1,k))
f4z(i,j,k)=d2dz*(-fh4(i,j,k-1)+fh4(i,j,k+1))
endif
enddo
enddo
enddo
return
end subroutine fderivs_batch4
!-----------------------------------------------------------------------------
! batch first derivatives (3 fields), same symmetry setup
!-----------------------------------------------------------------------------
subroutine fderivs_batch3(ex,f1,f2,f3, &
f1x,f1y,f1z,f2x,f2y,f2z,f3x,f3y,f3z, &
X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
implicit none
integer, intent(in ):: ex(1:3),symmetry,onoff
real*8, dimension(ex(1),ex(2),ex(3)), intent(in ):: f1,f2,f3
real*8, dimension(ex(1),ex(2),ex(3)), intent(out):: f1x,f1y,f1z
real*8, dimension(ex(1),ex(2),ex(3)), intent(out):: f2x,f2y,f2z
real*8, dimension(ex(1),ex(2),ex(3)), intent(out):: f3x,f3y,f3z
real*8, intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
real*8, intent(in ):: SYM1,SYM2,SYM3
!~~~~~~ other variables
real*8 :: dX,dY,dZ
real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh1,fh2,fh3
real*8, dimension(3) :: SoA
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
real*8, parameter :: ZEO=0.d0,ONE=1.d0
real*8, parameter :: TWO=2.d0,EIT=8.d0
real*8, parameter :: F12=1.2d1
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
SoA(1) = SYM1
SoA(2) = SYM2
SoA(3) = SYM3
call symmetry_bd(2,ex,f1,fh1,SoA)
call symmetry_bd(2,ex,f2,fh2,SoA)
call symmetry_bd(2,ex,f3,fh3,SoA)
d12dx = ONE/F12/dX
d12dy = ONE/F12/dY
d12dz = ONE/F12/dZ
d2dx = ONE/TWO/dX
d2dy = ONE/TWO/dY
d2dz = ONE/TWO/dZ
f1x = ZEO; f1y = ZEO; f1z = ZEO
f2x = ZEO; f2y = ZEO; f2z = ZEO
f3x = ZEO; f3y = ZEO; f3z = ZEO
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
if(i+2 <= imax .and. i-2 >= imin .and. &
j+2 <= jmax .and. j-2 >= jmin .and. &
k+2 <= kmax .and. k-2 >= kmin) then
f1x(i,j,k)=d12dx*(fh1(i-2,j,k)-EIT*fh1(i-1,j,k)+EIT*fh1(i+1,j,k)-fh1(i+2,j,k))
f1y(i,j,k)=d12dy*(fh1(i,j-2,k)-EIT*fh1(i,j-1,k)+EIT*fh1(i,j+1,k)-fh1(i,j+2,k))
f1z(i,j,k)=d12dz*(fh1(i,j,k-2)-EIT*fh1(i,j,k-1)+EIT*fh1(i,j,k+1)-fh1(i,j,k+2))
f2x(i,j,k)=d12dx*(fh2(i-2,j,k)-EIT*fh2(i-1,j,k)+EIT*fh2(i+1,j,k)-fh2(i+2,j,k))
f2y(i,j,k)=d12dy*(fh2(i,j-2,k)-EIT*fh2(i,j-1,k)+EIT*fh2(i,j+1,k)-fh2(i,j+2,k))
f2z(i,j,k)=d12dz*(fh2(i,j,k-2)-EIT*fh2(i,j,k-1)+EIT*fh2(i,j,k+1)-fh2(i,j,k+2))
f3x(i,j,k)=d12dx*(fh3(i-2,j,k)-EIT*fh3(i-1,j,k)+EIT*fh3(i+1,j,k)-fh3(i+2,j,k))
f3y(i,j,k)=d12dy*(fh3(i,j-2,k)-EIT*fh3(i,j-1,k)+EIT*fh3(i,j+1,k)-fh3(i,j+2,k))
f3z(i,j,k)=d12dz*(fh3(i,j,k-2)-EIT*fh3(i,j,k-1)+EIT*fh3(i,j,k+1)-fh3(i,j,k+2))
elseif(i+1 <= imax .and. i-1 >= imin .and. &
j+1 <= jmax .and. j-1 >= jmin .and. &
k+1 <= kmax .and. k-1 >= kmin) then
f1x(i,j,k)=d2dx*(-fh1(i-1,j,k)+fh1(i+1,j,k))
f1y(i,j,k)=d2dy*(-fh1(i,j-1,k)+fh1(i,j+1,k))
f1z(i,j,k)=d2dz*(-fh1(i,j,k-1)+fh1(i,j,k+1))
f2x(i,j,k)=d2dx*(-fh2(i-1,j,k)+fh2(i+1,j,k))
f2y(i,j,k)=d2dy*(-fh2(i,j-1,k)+fh2(i,j+1,k))
f2z(i,j,k)=d2dz*(-fh2(i,j,k-1)+fh2(i,j,k+1))
f3x(i,j,k)=d2dx*(-fh3(i-1,j,k)+fh3(i+1,j,k))
f3y(i,j,k)=d2dy*(-fh3(i,j-1,k)+fh3(i,j+1,k))
f3z(i,j,k)=d2dz*(-fh3(i,j,k-1)+fh3(i,j,k+1))
endif
enddo
enddo
enddo
return
end subroutine fderivs_batch3
!-----------------------------------------------------------------------------
! batch first derivatives (2 fields), same symmetry setup
!-----------------------------------------------------------------------------
subroutine fderivs_batch2(ex,f1,f2, &
f1x,f1y,f1z,f2x,f2y,f2z, &
X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
implicit none
integer, intent(in ):: ex(1:3),symmetry,onoff
real*8, dimension(ex(1),ex(2),ex(3)), intent(in ):: f1,f2
real*8, dimension(ex(1),ex(2),ex(3)), intent(out):: f1x,f1y,f1z
real*8, dimension(ex(1),ex(2),ex(3)), intent(out):: f2x,f2y,f2z
real*8, intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
real*8, intent(in ):: SYM1,SYM2,SYM3
!~~~~~~ other variables
real*8 :: dX,dY,dZ
real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh1,fh2
real*8, dimension(3) :: SoA
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
real*8, parameter :: ZEO=0.d0,ONE=1.d0
real*8, parameter :: TWO=2.d0,EIT=8.d0
real*8, parameter :: F12=1.2d1
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
SoA(1) = SYM1
SoA(2) = SYM2
SoA(3) = SYM3
call symmetry_bd(2,ex,f1,fh1,SoA)
call symmetry_bd(2,ex,f2,fh2,SoA)
d12dx = ONE/F12/dX
d12dy = ONE/F12/dY
d12dz = ONE/F12/dZ
d2dx = ONE/TWO/dX
d2dy = ONE/TWO/dY
d2dz = ONE/TWO/dZ
f1x = ZEO; f1y = ZEO; f1z = ZEO
f2x = ZEO; f2y = ZEO; f2z = ZEO
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
if(i+2 <= imax .and. i-2 >= imin .and. &
j+2 <= jmax .and. j-2 >= jmin .and. &
k+2 <= kmax .and. k-2 >= kmin) then
f1x(i,j,k)=d12dx*(fh1(i-2,j,k)-EIT*fh1(i-1,j,k)+EIT*fh1(i+1,j,k)-fh1(i+2,j,k))
f1y(i,j,k)=d12dy*(fh1(i,j-2,k)-EIT*fh1(i,j-1,k)+EIT*fh1(i,j+1,k)-fh1(i,j+2,k))
f1z(i,j,k)=d12dz*(fh1(i,j,k-2)-EIT*fh1(i,j,k-1)+EIT*fh1(i,j,k+1)-fh1(i,j,k+2))
f2x(i,j,k)=d12dx*(fh2(i-2,j,k)-EIT*fh2(i-1,j,k)+EIT*fh2(i+1,j,k)-fh2(i+2,j,k))
f2y(i,j,k)=d12dy*(fh2(i,j-2,k)-EIT*fh2(i,j-1,k)+EIT*fh2(i,j+1,k)-fh2(i,j+2,k))
f2z(i,j,k)=d12dz*(fh2(i,j,k-2)-EIT*fh2(i,j,k-1)+EIT*fh2(i,j,k+1)-fh2(i,j,k+2))
elseif(i+1 <= imax .and. i-1 >= imin .and. &
j+1 <= jmax .and. j-1 >= jmin .and. &
k+1 <= kmax .and. k-1 >= kmin) then
f1x(i,j,k)=d2dx*(-fh1(i-1,j,k)+fh1(i+1,j,k))
f1y(i,j,k)=d2dy*(-fh1(i,j-1,k)+fh1(i,j+1,k))
f1z(i,j,k)=d2dz*(-fh1(i,j,k-1)+fh1(i,j,k+1))
f2x(i,j,k)=d2dx*(-fh2(i-1,j,k)+fh2(i+1,j,k))
f2y(i,j,k)=d2dy*(-fh2(i,j-1,k)+fh2(i,j+1,k))
f2z(i,j,k)=d2dz*(-fh2(i,j,k-1)+fh2(i,j,k+1))
endif
enddo
enddo
enddo
return
end subroutine fderivs_batch2
#elif (ghost_width == 4) #elif (ghost_width == 4)
! sixth order code ! sixth order code
@@ -2081,6 +2380,9 @@
end subroutine fderivs end subroutine fderivs
!----------------------------------------------------------------------------- !-----------------------------------------------------------------------------
! batch first derivatives (4 fields), same symmetry setup
!-----------------------------------------------------------------------------
!-----------------------------------------------------------------------------
! !
! single derivatives dx ! single derivatives dx
! !

View File

@@ -19,60 +19,48 @@
!~~~~~~~> Local variable: !~~~~~~~> Local variable:
integer :: i,j,k real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg
real*8 :: lgxx,lgyy,lgzz,ldetg real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz
real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
real*8 :: ltrA,lscale
real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
!~~~~~~> !~~~~~~>
do k=1,ex(3) gxx = dxx + ONE
do j=1,ex(2) gyy = dyy + ONE
do i=1,ex(1) gzz = dzz + ONE
lgxx = dxx(i,j,k) + ONE detg = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
lgyy = dyy(i,j,k) + ONE gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
lgzz = dzz(i,j,k) + ONE gupxx = ( gyy * gzz - gyz * gyz ) / detg
gupxy = - ( gxy * gzz - gyz * gxz ) / detg
gupxz = ( gxy * gyz - gyy * gxz ) / detg
gupyy = ( gxx * gzz - gxz * gxz ) / detg
gupyz = - ( gxx * gyz - gxy * gxz ) / detg
gupzz = ( gxx * gyy - gxy * gxy ) / detg
ldetg = lgxx * lgyy * lgzz & trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz &
+ gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) & + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
+ gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
- gxz(i,j,k) * lgyy * gxz(i,j,k) &
- gxy(i,j,k) * gxy(i,j,k) * lgzz &
- lgxx * gyz(i,j,k) * gyz(i,j,k)
lgupxx = ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg Axx = Axx - F1o3 * gxx * trA
lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg Axy = Axy - F1o3 * gxy * trA
lgupxz = ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg Axz = Axz - F1o3 * gxz * trA
lgupyy = ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg Ayy = Ayy - F1o3 * gyy * trA
lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg Ayz = Ayz - F1o3 * gyz * trA
lgupzz = ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg Azz = Azz - F1o3 * gzz * trA
ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) & detg = ONE / ( detg ** F1o3 )
+ lgupzz * Azz(i,j,k) &
+ TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
+ lgupyz * Ayz(i,j,k))
Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA gxx = gxx * detg
Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA gxy = gxy * detg
Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA gxz = gxz * detg
Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA gyy = gyy * detg
Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA gyz = gyz * detg
Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA gzz = gzz * detg
lscale = ONE / ( ldetg ** F1o3 ) dxx = gxx - ONE
dyy = gyy - ONE
dxx(i,j,k) = lgxx * lscale - ONE dzz = gzz - ONE
gxy(i,j,k) = gxy(i,j,k) * lscale
gxz(i,j,k) = gxz(i,j,k) * lscale
dyy(i,j,k) = lgyy * lscale - ONE
gyz(i,j,k) = gyz(i,j,k) * lscale
dzz(i,j,k) = lgzz * lscale - ONE
enddo
enddo
enddo
return return
@@ -95,70 +83,50 @@
!~~~~~~~> Local variable: !~~~~~~~> Local variable:
integer :: i,j,k real*8, dimension(ex(1),ex(2),ex(3)) :: trA
real*8 :: lgxx,lgyy,lgzz,lscale real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz
real*8 :: lgxy,lgxz,lgyz real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
real*8 :: ltrA
real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
!~~~~~~> !~~~~~~>
do k=1,ex(3) gxx = dxx + ONE
do j=1,ex(2) gyy = dyy + ONE
do i=1,ex(1) gzz = dzz + ONE
! for g
gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
! for g: normalize determinant first gupzz = ONE / ( gupzz ** F1o3 )
lgxx = dxx(i,j,k) + ONE
lgyy = dyy(i,j,k) + ONE
lgzz = dzz(i,j,k) + ONE
lgxy = gxy(i,j,k)
lgxz = gxz(i,j,k)
lgyz = gyz(i,j,k)
lscale = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz & gxx = gxx * gupzz
+ lgxz * lgxy * lgyz - lgxz * lgyy * lgxz & gxy = gxy * gupzz
- lgxy * lgxy * lgzz - lgxx * lgyz * lgyz gxz = gxz * gupzz
gyy = gyy * gupzz
gyz = gyz * gupzz
gzz = gzz * gupzz
lscale = ONE / ( lscale ** F1o3 ) dxx = gxx - ONE
dyy = gyy - ONE
dzz = gzz - ONE
! for A
lgxx = lgxx * lscale gupxx = ( gyy * gzz - gyz * gyz )
lgxy = lgxy * lscale gupxy = - ( gxy * gzz - gyz * gxz )
lgxz = lgxz * lscale gupxz = ( gxy * gyz - gyy * gxz )
lgyy = lgyy * lscale gupyy = ( gxx * gzz - gxz * gxz )
lgyz = lgyz * lscale gupyz = - ( gxx * gyz - gxy * gxz )
lgzz = lgzz * lscale gupzz = ( gxx * gyy - gxy * gxy )
dxx(i,j,k) = lgxx - ONE trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz &
gxy(i,j,k) = lgxy + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
gxz(i,j,k) = lgxz
dyy(i,j,k) = lgyy - ONE
gyz(i,j,k) = lgyz
dzz(i,j,k) = lgzz - ONE
! for A: trace-free using normalized metric (det=1, no division needed) Axx = Axx - F1o3 * gxx * trA
lgupxx = ( lgyy * lgzz - lgyz * lgyz ) Axy = Axy - F1o3 * gxy * trA
lgupxy = - ( lgxy * lgzz - lgyz * lgxz ) Axz = Axz - F1o3 * gxz * trA
lgupxz = ( lgxy * lgyz - lgyy * lgxz ) Ayy = Ayy - F1o3 * gyy * trA
lgupyy = ( lgxx * lgzz - lgxz * lgxz ) Ayz = Ayz - F1o3 * gyz * trA
lgupyz = - ( lgxx * lgyz - lgxy * lgxz ) Azz = Azz - F1o3 * gzz * trA
lgupzz = ( lgxx * lgyy - lgxy * lgxy )
ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+ lgupzz * Azz(i,j,k) &
+ TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
+ lgupyz * Ayz(i,j,k))
Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
enddo
enddo
enddo
return return

View File

@@ -324,6 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -349,6 +350,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -377,6 +379,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -883,17 +886,14 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
integer::i integer::i
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8) funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
enddo enddo
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1 do i=0,ord-1
funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2) funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
enddo enddo
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1 do i=0,ord-1
funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3) funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
enddo enddo
@@ -912,6 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -940,6 +941,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -1116,66 +1118,64 @@ end subroutine d2dump
! Lagrangian polynomial interpolation ! Lagrangian polynomial interpolation
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
!DIR$ ATTRIBUTES FORCEINLINE :: polint subroutine polint(xa,ya,x,y,dy,ordn)
subroutine polint(xa, ya, x, y, dy, ordn)
implicit none implicit none
integer, intent(in) :: ordn !~~~~~~> Input Parameter:
real*8, dimension(ordn), intent(in) :: xa, ya integer,intent(in) :: ordn
real*8, dimension(ordn), intent(in) :: xa,ya
real*8, intent(in) :: x real*8, intent(in) :: x
real*8, intent(out) :: y, dy real*8, intent(out) :: y,dy
integer :: i, m, ns, n_m !~~~~~~> Other parameter:
real*8, dimension(ordn) :: c, d, ho
real*8 :: dif, dift, hp, h, den_val
c = ya integer :: m,n,ns
d = ya real*8, dimension(ordn) :: c,d,den,ho
ho = xa - x real*8 :: dif,dift
ns = 1 !~~~~~~>
dif = abs(x - xa(1))
do i = 2, ordn n=ordn
dift = abs(x - xa(i)) m=ordn
if (dift < dif) then
ns = i c=ya
dif = dift d=ya
end if ho=xa-x
ns=1
dif=abs(x-xa(1))
do m=1,n
dift=abs(x-xa(m))
if(dift < dif) then
ns=m
dif=dift
end if
end do end do
y = ya(ns) y=ya(ns)
ns = ns - 1 ns=ns-1
do m=1,n-1
do m = 1, ordn - 1 den(1:n-m)=ho(1:n-m)-ho(1+m:n)
n_m = ordn - m if (any(den(1:n-m) == 0.0))then
do i = 1, n_m write(*,*) 'failure in polint for point',x
hp = ho(i) write(*,*) 'with input points: ',xa
h = ho(i+m) stop
den_val = hp - h endif
den(1:n-m)=(c(2:n-m+1)-d(1:n-m))/den(1:n-m)
if (den_val == 0.0d0) then d(1:n-m)=ho(1+m:n)*den(1:n-m)
write(*,*) 'failure in polint for point',x c(1:n-m)=ho(1:n-m)*den(1:n-m)
write(*,*) 'with input points: ',xa if (2*ns < n-m) then
stop dy=c(ns+1)
end if
den_val = (c(i+1) - d(i)) / den_val
d(i) = h * den_val
c(i) = hp * den_val
end do
if (2 * ns < n_m) then
dy = c(ns + 1)
else else
dy = d(ns) dy=d(ns)
ns = ns - 1 ns=ns-1
end if end if
y = y + dy y=y+dy
end do end do
return return
end subroutine polint end subroutine polint
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
! !
@@ -1183,37 +1183,35 @@ end subroutine d2dump
! !
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn) subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
implicit none implicit none
!~~~~~~> Input parameters:
integer,intent(in) :: ordn integer,intent(in) :: ordn
real*8, dimension(1:ordn), intent(in) :: x1a,x2a real*8, dimension(1:ordn), intent(in) :: x1a,x2a
real*8, dimension(1:ordn,1:ordn), intent(in) :: ya real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
real*8, intent(in) :: x1,x2 real*8, intent(in) :: x1,x2
real*8, intent(out) :: y,dy real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER !~~~~~~> Other parameters:
integer :: i,m integer :: i,m
real*8, dimension(ordn) :: ymtmp real*8, dimension(ordn) :: ymtmp
real*8, dimension(ordn) :: yntmp real*8, dimension(ordn) :: yntmp
m=size(x1a) m=size(x1a)
do i=1,m do i=1,m
yntmp=ya(i,:) yntmp=ya(i,:)
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
end do
call polint(x1a,ymtmp,x1,y,dy,ordn)
#else
integer :: j
real*8, dimension(ordn) :: ymtmp
real*8 :: dy_temp
do j=1,ordn
call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
end do end do
call polint(x2a, ymtmp, x2, y, dy, ordn)
#endif call polint(x1a,ymtmp,x1,y,dy,ordn)
return return
end subroutine polin2 end subroutine polin2
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
! !
@@ -1221,15 +1219,18 @@ end subroutine d2dump
! !
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn) subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
implicit none implicit none
!~~~~~~> Input parameters:
integer,intent(in) :: ordn integer,intent(in) :: ordn
real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
real*8, intent(in) :: x1,x2,x3 real*8, intent(in) :: x1,x2,x3
real*8, intent(out) :: y,dy real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER !~~~~~~> Other parameters:
integer :: i,j,m,n integer :: i,j,m,n
real*8, dimension(ordn,ordn) :: yatmp real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(ordn) :: ymtmp real*8, dimension(ordn) :: ymtmp
@@ -1238,33 +1239,24 @@ end subroutine d2dump
m=size(x1a) m=size(x1a)
n=size(x2a) n=size(x2a)
do i=1,m do i=1,m
do j=1,n do j=1,n
yqtmp=ya(i,j,:) yqtmp=ya(i,j,:)
call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn) call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
end do end do
yntmp=yatmp(i,:) yntmp=yatmp(i,:)
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
end do
call polint(x1a,ymtmp,x1,y,dy,ordn)
#else
integer :: j, k
real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(ordn) :: ymtmp
real*8 :: dy_temp
do k=1,ordn
do j=1,ordn
call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
end do
end do end do
do k=1,ordn
call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn) call polint(x1a,ymtmp,x1,y,dy,ordn)
end do
call polint(x3a, ymtmp, x3, y, dy, ordn)
#endif
return return
end subroutine polin3 end subroutine polin3
!-------------------------------------------------------------------------------------- !--------------------------------------------------------------------------------------
! calculate L2norm ! calculate L2norm

View File

@@ -65,8 +65,6 @@ real*8,intent(in) :: eps
! dx^4 ! dx^4
! note the sign (-1)^r-1, now r=2 ! note the sign (-1)^r-1, now r=2
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)

View File

@@ -487,201 +487,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
end subroutine lopsided end subroutine lopsided
!-----------------------------------------------------------------------------
! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
! Mathematically identical to calling lopsided then kodis separately.
!-----------------------------------------------------------------------------
subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
implicit none
!~~~~~~> Input parameters:
integer, intent(in) :: ex(1:3),Symmetry
real*8, intent(in) :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
real*8,dimension(ex(1),ex(2),ex(3)),intent(in) :: f,Sfx,Sfy,Sfz
real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
real*8,dimension(3),intent(in) ::SoA
real*8,intent(in) :: eps
!~~~~~~> local variables:
! note index -2,-1,0, so we have 3 extra points
real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)) :: fh
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: dX,dY,dZ
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
real*8, parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
real*8, parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
real*8, parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
! kodis parameters
real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
real*8, parameter :: cof=6.4d1 ! 2^6
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
d12dx = ONE/F12/dX
d12dy = ONE/F12/dY
d12dz = ONE/F12/dZ
d2dx = ONE/TWO/dX
d2dy = ONE/TWO/dY
d2dz = ONE/TWO/dZ
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
! Single symmetry_bd call shared by both advection and dissipation
call symmetry_bd(3,ex,f,fh,SoA)
! ---- Advection (lopsided) loop ----
! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
! x direction
if(Sfx(i,j,k) > ZEO)then
if(i+3 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
-F6*fh(i+2,j,k)+ fh(i+3,j,k))
elseif(i+2 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i+1 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
-F6*fh(i-2,j,k)+ fh(i-3,j,k))
endif
elseif(Sfx(i,j,k) < ZEO)then
if(i-3 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
-F6*fh(i-2,j,k)+ fh(i-3,j,k))
elseif(i-2 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i-1 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
-F6*fh(i+2,j,k)+ fh(i+3,j,k))
endif
endif
! y direction
if(Sfy(i,j,k) > ZEO)then
if(j+3 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
-F6*fh(i,j+2,k)+ fh(i,j+3,k))
elseif(j+2 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j+1 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
-F6*fh(i,j-2,k)+ fh(i,j-3,k))
endif
elseif(Sfy(i,j,k) < ZEO)then
if(j-3 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
-F6*fh(i,j-2,k)+ fh(i,j-3,k))
elseif(j-2 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j-1 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
-F6*fh(i,j+2,k)+ fh(i,j+3,k))
endif
endif
! z direction
if(Sfz(i,j,k) > ZEO)then
if(k+3 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
elseif(k+2 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k+1 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
-F6*fh(i,j,k-2)+ fh(i,j,k-3))
endif
elseif(Sfz(i,j,k) < ZEO)then
if(k-3 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
-F6*fh(i,j,k-2)+ fh(i,j,k-3))
elseif(k-2 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k-1 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
endif
endif
enddo
enddo
enddo
! ---- Dissipation (kodis) loop ----
if(eps > ZEO) then
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
if(i-3 >= imin .and. i+3 <= imax .and. &
j-3 >= jmin .and. j+3 <= jmax .and. &
k-3 >= kmin .and. k+3 <= kmax) then
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )/dX + &
( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )/dY + &
( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )/dZ )
endif
enddo
enddo
enddo
endif
return
end subroutine lopsided_kodis
#elif (ghost_width == 4) #elif (ghost_width == 4)
! sixth order code ! sixth order code
! Compute advection terms in right hand sides of field equations ! Compute advection terms in right hand sides of field equations

View File

@@ -16,12 +16,6 @@ include makefile.inc
.cu.o: .cu.o:
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
TwoPunctures.o: TwoPunctures.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
# Input files # Input files
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o bssn_class.o surface_integral.o ShellPatch.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\
@@ -102,7 +96,7 @@ ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES) TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean: clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f rm *.o ABE ABEGPU TwoPunctureABE make.log -f

View File

@@ -10,16 +10,16 @@ filein = -I/usr/include/ -I${MKLROOT}/include
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization) ## Aggressive optimization flags:
## -fprofile-instr-use: use collected profile data to guide optimization decisions ## -O3: Maximum optimization
## (branch prediction, basic block layout, inlining, loop unrolling) ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
PROFDATA = ../../pgo_profile/default.profdata ## -fp-model fast=2: Aggressive floating-point optimizations
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ ## -fma: Enable fused multiply-add instructions
-fprofile-instr-use=$(PROFDATA) \ ## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \
-Dfortran3 -Dnewc -I${MKLROOT}/include -Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ f90appflags = -O3 -xHost -fp-model fast=2 -fma \
-fprofile-instr-use=$(PROFDATA) \ -fpp -I${MKLROOT}/include
-align array64byte -fpp -I${MKLROOT}/include
f90 = ifx f90 = ifx
f77 = ifx f77 = ifx
CXX = icpx CXX = icpx

View File

@@ -11,8 +11,6 @@
#include <strstream> #include <strstream>
#include <cmath> #include <cmath>
#include <map> #include <map>
#include <vector>
#include <algorithm>
using namespace std; using namespace std;
#else #else
#include <iostream.h> #include <iostream.h>
@@ -222,9 +220,16 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
pox[2][n] = rex * nz_g[n]; pox[2][n] = rex * nz_g[n];
} }
double *shellf;
shellf = new double[n_tot * InList];
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
int mp, Lp, Nmin, Nmax; int mp, Lp, Nmin, Nmax;
mp = n_tot / cpusize; mp = n_tot / cpusize;
Lp = n_tot - cpusize * mp; Lp = n_tot - cpusize * mp;
if (Lp > myrank) if (Lp > myrank)
{ {
Nmin = myrank * mp + myrank; Nmin = myrank * mp + myrank;
@@ -236,10 +241,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
Nmax = Nmin + mp - 1; Nmax = Nmin + mp - 1;
} }
double *shellf; //|~~~~~> Integrate the dot product of Dphi with the surface normal.
shellf = new double[n_tot * InList];
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
double *RP_out, *IP_out; double *RP_out, *IP_out;
RP_out = new double[NN]; RP_out = new double[NN];
IP_out = new double[NN]; IP_out = new double[NN];
@@ -360,17 +363,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -562,17 +556,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, Comm_here);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -750,17 +735,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -1008,17 +984,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -1452,17 +1419,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -1896,17 +1854,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH,
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -2091,17 +2040,8 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch2 *GH, var *
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -2286,17 +2226,8 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch *GH, var *R
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -2383,9 +2314,25 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
pox[2][n] = rex * nz_g[n]; pox[2][n] = rex * nz_g[n];
} }
double *shellf;
shellf = new double[n_tot * InList];
// we have assumed there is only one box on this level,
// so we do not need loop boxes
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
double Mass_out = 0;
double ang_outx, ang_outy, ang_outz;
double p_outx, p_outy, p_outz;
ang_outx = ang_outy = ang_outz = 0.0;
p_outx = p_outy = p_outz = 0.0;
const double f1o8 = 0.125;
int mp, Lp, Nmin, Nmax; int mp, Lp, Nmin, Nmax;
mp = n_tot / cpusize; mp = n_tot / cpusize;
Lp = n_tot - cpusize * mp; Lp = n_tot - cpusize * mp;
if (Lp > myrank) if (Lp > myrank)
{ {
Nmin = myrank * mp + myrank; Nmin = myrank * mp + myrank;
@@ -2397,20 +2344,6 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
Nmax = Nmin + mp - 1; Nmax = Nmin + mp - 1;
} }
double *shellf;
shellf = new double[n_tot * InList];
// we have assumed there is only one box on this level,
// so we do not need loop boxes
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
double Mass_out = 0;
double ang_outx, ang_outy, ang_outz;
double p_outx, p_outy, p_outz;
ang_outx = ang_outy = ang_outz = 0.0;
p_outx = p_outy = p_outz = 0.0;
const double f1o8 = 0.125;
double Chi, Psi; double Chi, Psi;
double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz; double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz; double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
@@ -2531,13 +2464,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
} }
} }
{ MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
double scalar_in[7]; MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3]; MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
} MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#ifdef GaussInt #ifdef GaussInt
mass = mass * rex * rex * dphi * factor; mass = mass * rex * rex * dphi * factor;
@@ -2800,13 +2735,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
} }
} }
{ MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
double scalar_in[7]; MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, Comm_here); MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3]; MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
} MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
#ifdef GaussInt #ifdef GaussInt
mass = mass * rex * rex * dphi * factor; mass = mass * rex * rex * dphi * factor;
@@ -3083,13 +3020,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *c
} }
} }
{ MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
double scalar_in[7]; MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3]; MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
} MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#ifdef GaussInt #ifdef GaussInt
mass = mass * rex * rex * dphi * factor; mass = mass * rex * rex * dphi * factor;
@@ -3668,17 +3607,8 @@ void surface_integral::surf_Wave(double rex, cgh *GH, ShellPatch *SH,
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.

View File

@@ -253,7 +253,19 @@ def generate_macrodef_h():
# Define macro buffer_width # Define macro buffer_width
# number of buffer points for mesh-refinement interfaces # number of buffer points for mesh-refinement interfaces
print( "#define buffer_width 6", file=file1 ) # Calculate ghost_width based on Finite_Diffenence_Method to optimize buffer_width
if ( input_data.Finite_Diffenence_Method == "2nd-order" ):
gw = 2
elif ( input_data.Finite_Diffenence_Method == "4th-order" ):
gw = 3
elif ( input_data.Finite_Diffenence_Method == "6th-order" ):
gw = 4
elif ( input_data.Finite_Diffenence_Method == "8th-order" ):
gw = 5
else:
gw = 5 # Default conservative value
print( f"#define buffer_width {gw + 1}", file=file1 )
print( file=file1 ) print( file=file1 )
# Define macro SC_width as buffer_width # Define macro SC_width as buffer_width
@@ -392,6 +404,17 @@ def generate_macrodef_fh():
print( "# Finite_Difference_Method #define ghost_width setting error!!!", file=file1 ) print( "# Finite_Difference_Method #define ghost_width setting error!!!", file=file1 )
print( file=file1 ) print( file=file1 )
# Define macro DEBUG_NAN_CHECK
# 0: off (default), 1: on
debug_nan_check = getattr(input_data, "Debug_NaN_Check", 0)
if debug_nan_check:
print( "#define DEBUG_NAN_CHECK 1", file=file1 )
print( file=file1 )
else:
print( "#define DEBUG_NAN_CHECK 0", file=file1 )
print( file=file1 )
# Whether to use a shell-patch grid # Whether to use a shell-patch grid
# use shell or not # use shell or not
@@ -514,6 +537,9 @@ def generate_macrodef_fh():
print( " 6th order: 4", file=file1 ) print( " 6th order: 4", file=file1 )
print( " 8th order: 5", file=file1 ) print( " 8th order: 5", file=file1 )
print( file=file1 ) print( file=file1 )
print( "define DEBUG_NAN_CHECK", file=file1 )
print( " 0: off (default), 1: on", file=file1 )
print( file=file1 )
print( "define WithShell", file=file1 ) print( "define WithShell", file=file1 )
print( " use shell or not", file=file1 ) print( " use shell or not", file=file1 )
print( file=file1 ) print( file=file1 )

View File

@@ -36,6 +36,7 @@ Equation_Class = "BSSN" ## Evolution Equation: choose
Initial_Data_Method = "Ansorg-TwoPuncture" ## initial data method: choose "Ansorg-TwoPuncture", "Lousto-Analytical", "Cao-Analytical", "KerrSchild-Analytical" Initial_Data_Method = "Ansorg-TwoPuncture" ## initial data method: choose "Ansorg-TwoPuncture", "Lousto-Analytical", "Cao-Analytical", "KerrSchild-Analytical"
Time_Evolution_Method = "runge-kutta-45" ## time evolution method: choose "runge-kutta-45" Time_Evolution_Method = "runge-kutta-45" ## time evolution method: choose "runge-kutta-45"
Finite_Diffenence_Method = "4th-order" ## finite-difference method: choose "2nd-order", "4th-order", "6th-order", "8th-order" Finite_Diffenence_Method = "4th-order" ## finite-difference method: choose "2nd-order", "4th-order", "6th-order", "8th-order"
Debug_NaN_Check = 0 ## enable NaN checks in compute_rhs_bssn: 0 (off) or 1 (on)
################################################# #################################################

View File

@@ -10,18 +10,18 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import subprocess import subprocess
import time
## CPU core binding configuration using taskset ## CPU core binding configuration using taskset
## taskset ensures all child processes inherit the CPU affinity mask ## taskset ensures all child processes inherit the CPU affinity mask
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111) ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
#NUMACTL_CPU_BIND = "taskset -c 0-111" #NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
NUMACTL_CPU_BIND = "taskset -c 16-47,64-95" NUMACTL_CPU_BIND = ""
## Build parallelism configuration ## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds ## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 96 BUILD_JOBS = 14
################################################################## ##################################################################
@@ -118,7 +118,6 @@ def run_ABE():
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
@@ -154,14 +153,13 @@ def run_ABE():
## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE ## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
def run_TwoPunctureABE(): def run_TwoPunctureABE():
tp_time1=time.time()
print( ) print( )
print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) print( " Running the AMSS-NCKU executable file TwoPunctureABE " )
print( ) print( )
## Define the command to run ## Define the command to run
#TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE" TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
TwoPuncture_command = " ./TwoPunctureABE"
TwoPuncture_command_outfile = "TwoPunctureABE_out.log" TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
## Execute the command with subprocess.Popen and stream output ## Execute the command with subprocess.Popen and stream output
@@ -182,9 +180,7 @@ def run_TwoPunctureABE():
print( ) print( )
print( " The TwoPunctureABE simulation is finished " ) print( " The TwoPunctureABE simulation is finished " )
print( ) print( )
tp_time2=time.time()
et=tp_time2-tp_time1
print(f"Used time: {et}")
return return
################################################################## ##################################################################

View File

@@ -1,29 +0,0 @@
import multiprocessing
def run_plot_task(task):
"""Execute a single plotting task.
Parameters
----------
task : tuple
A tuple of (function, args_tuple) where function is a callable
plotting function and args_tuple contains its arguments.
"""
func, args = task
return func(*args)
def run_plot_tasks_parallel(plot_tasks):
"""Execute a list of independent plotting tasks in parallel.
Uses the 'fork' context to create worker processes so that the main
script is NOT re-imported/re-executed in child processes.
Parameters
----------
plot_tasks : list of tuples
Each element is (function, args_tuple).
"""
ctx = multiprocessing.get_context('fork')
with ctx.Pool() as pool:
pool.map(run_plot_task, plot_tasks)

View File

@@ -1,97 +0,0 @@
# AMSS-NCKU PGO Profile Analysis Report
## 1. Profiling Environment
| Item | Value |
|------|-------|
| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
| Instrumentation Flag | `-fprofile-instr-generate` |
| Optimization Level (instrumented) | `-O2 -xHost -fma` |
| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
| Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
| Merged Profile | `default.profdata` (394 KB) |
| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
## 2. Reduced Simulation Parameters (for profiling run)
| Parameter | Production Value | Profiling Value |
|-----------|-----------------|-----------------|
| MPI_processes | 64 | 1 |
| grid_level | 9 | 4 |
| static_grid_level | 5 | 3 |
| static_grid_number | 96 | 24 |
| moving_grid_number | 48 | 16 |
| largest_box_xyz_max | 320^3 | 160^3 |
| Final_Evolution_Time | 1000.0 | 10.0 |
| Evolution_Step_Number | 10,000,000 | 1,000 |
| Detector_Number | 12 | 2 |
## 3. Profile Summary
| Metric | Value |
|--------|-------|
| Total instrumented functions | 1,392 |
| Functions with non-zero counts | 117 (8.4%) |
| Functions with zero counts | 1,275 (91.6%) |
| Maximum function entry count | 386,459,248 |
| Maximum internal block count | 370,477,680 |
| Total block count | 4,198,023,118 |
## 4. Top 20 Hotspot Functions
| Rank | Total Count | Max Block Count | Function | Category |
|------|------------|-----------------|----------|----------|
| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
## 5. Hotspot Category Breakdown
Top 20 functions account for ~98% of total execution counts:
| Category | Functions | Combined Count | Share |
|----------|-----------|---------------|-------|
| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
| Time integration | rungekutta4_rout_ | ~119M | ~3% |
| Dissipation | kodis_ | ~92M | ~2% |
| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% |
## 6. Conclusions
1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
2. **Hotspot concentration is high**: Top 5 functions alone account for ~76% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets.
3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
## 7. PGO Phase 2 Usage
To apply the profile, use the following flags in `makefile.inc`:
```makefile
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
-align array64byte -fpp -I${MKLROOT}/include
```

Binary file not shown.

Binary file not shown.

View File

@@ -11,8 +11,6 @@
import numpy ## numpy for array operations import numpy ## numpy for array operations
import scipy ## scipy for interpolation and signal processing import scipy ## scipy for interpolation and signal processing
import math import math
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt ## matplotlib for plotting import matplotlib.pyplot as plt ## matplotlib for plotting
import os ## os for system/file operations import os ## os for system/file operations

View File

@@ -8,23 +8,16 @@
## ##
################################################# #################################################
## Restrict OpenMP to one thread per process so that running
## many workers in parallel does not create an O(workers * BLAS_threads)
## thread explosion. The variable MUST be set before numpy/scipy
## are imported, because the BLAS library reads them only at load time.
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
import numpy import numpy
import scipy import scipy
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm from matplotlib.colors import LogNorm
from mpl_toolkits.mplot3d import Axes3D from mpl_toolkits.mplot3d import Axes3D
## import torch ## import torch
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import os
######################################################################################### #########################################################################################
@@ -199,19 +192,3 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
#################################################################################### ####################################################################################
####################################################################################
## Allow this module to be run as a standalone script so that each
## binary-data plot can be executed in a fresh subprocess whose BLAS
## environment variables (set above) take effect before numpy loads.
##
## Usage: python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
####################################################################################
if __name__ == '__main__':
import sys
if len(sys.argv) != 4:
print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
sys.exit(1)
plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3])

View File

@@ -8,8 +8,6 @@
################################################# #################################################
import numpy ## numpy for array operations import numpy ## numpy for array operations
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt ## matplotlib for plotting import matplotlib.pyplot as plt ## matplotlib for plotting
from mpl_toolkits.mplot3d import Axes3D ## needed for 3D plots from mpl_toolkits.mplot3d import Axes3D ## needed for 3D plots
import glob import glob
@@ -17,9 +15,6 @@ import os ## operating system utilities
import plot_binary_data import plot_binary_data
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import subprocess
import sys
import multiprocessing
# plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots # plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots
@@ -55,40 +50,10 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
file_list.append(x) file_list.append(x)
print(x) print(x)
## Plot each file in parallel using subprocesses. ## Plot each file in the list
## Each subprocess is a fresh Python process where the BLAS thread-count
## environment variables (set at the top of plot_binary_data.py) take
## effect before numpy is imported. This avoids the thread explosion
## that occurs when multiprocessing.Pool with 'fork' context inherits
## already-initialized multi-threaded BLAS from the parent.
script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
running = []
failed = []
for filename in file_list: for filename in file_list:
print(filename) print(filename)
proc = subprocess.Popen( plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
[sys.executable, script, filename, binary_outdir, figure_outdir],
)
running.append( (proc, filename) )
## Keep at most max_workers subprocesses active at a time
if len(running) >= max_workers:
p, fn = running.pop(0)
p.wait()
if p.returncode != 0:
failed.append(fn)
## Wait for all remaining subprocesses to finish
for p, fn in running:
p.wait()
if p.returncode != 0:
failed.append(fn)
if failed:
print( " WARNING: the following binary data plots failed:" )
for fn in failed:
print( " ", fn )
print( ) print( )
print( " Binary Data Plot Has been Finished " ) print( " Binary Data Plot Has been Finished " )