Compare commits
2 Commits
cjy-oneapi
...
yx-fmisc
| Author | SHA1 | Date | |
|---|---|---|---|
| 3f7e20f702 | |||
| 673dd20722 |
5
.gitignore
vendored
5
.gitignore
vendored
@@ -1,6 +1,3 @@
|
||||
__pycache__
|
||||
GW150914
|
||||
GW150914-origin
|
||||
docs
|
||||
*.tmp
|
||||
|
||||
GW150914-origin
|
||||
445
AMSS_NCKU_ABEtest.py
Normal file
445
AMSS_NCKU_ABEtest.py
Normal file
@@ -0,0 +1,445 @@
|
||||
|
||||
##################################################################
|
||||
##
|
||||
## AMSS-NCKU ABE Test Program (Skip TwoPuncture if data exists)
|
||||
## Modified from AMSS_NCKU_Program.py
|
||||
## Author: Xiaoqu
|
||||
## Modified: 2026/02/01
|
||||
##
|
||||
##################################################################
|
||||
|
||||
|
||||
##################################################################
|
||||
|
||||
## Print program introduction
|
||||
|
||||
import print_information
|
||||
|
||||
print_information.print_program_introduction()
|
||||
|
||||
##################################################################
|
||||
|
||||
import AMSS_NCKU_Input as input_data
|
||||
|
||||
##################################################################
|
||||
|
||||
## Create directories to store program run data
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import time
|
||||
|
||||
## Set the output directory according to the input file
File_directory = os.path.join(input_data.File_directory)

## Directories derived from the run directory
output_directory         = os.path.join(File_directory, "AMSS_NCKU_output")
binary_results_directory = os.path.join(output_directory, input_data.Output_directory)
figure_directory         = os.path.join(File_directory, "figure")

## skip_twopuncture becomes True only when the user explicitly asks to reuse
## the TwoPuncture initial data left behind by a previous run
skip_twopuncture  = False
directory_existed = os.path.exists(File_directory)

if directory_existed:
    print( " Output directory already exists." )
    print()

    # Check if TwoPuncture initial data files exist
    if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture"):
        twopuncture_output = os.path.join(output_directory, "TwoPunctureABE")
        input_par          = os.path.join(output_directory, "input.par")

        if os.path.exists(twopuncture_output) and os.path.exists(input_par):
            print( " Found existing TwoPuncture initial data." )
            print( " Do you want to skip TwoPuncture phase and reuse existing data?" )
            print( " Input 'skip' to skip TwoPuncture and start ABE directly" )
            print( " Input 'regenerate' to regenerate everything from scratch" )
            print()

            while True:
                try:
                    inputvalue = input()
                except EOFError:
                    # stdin is closed (batch job, redirected input): no answer can
                    # ever arrive, so stop cleanly instead of dying with a traceback.
                    # Nothing has been deleted at this point.
                    sys.exit( " No answer could be read from standard input; existing data left untouched." )
                except ValueError:
                    # e.g. undecodable bytes on stdin; ask again
                    print( " Please input 'skip' or 'regenerate'." )
                    continue

                if ( inputvalue == "skip" ):
                    print( " Skipping TwoPuncture phase, will reuse existing initial data." )
                    print()
                    skip_twopuncture = True
                    break
                elif ( inputvalue == "regenerate" ):
                    print( " Regenerating everything from scratch." )
                    print()
                    skip_twopuncture = False
                    break
                else:
                    print( " Please input 'skip' or 'regenerate'." )
        else:
            print( " TwoPuncture initial data not found, will regenerate everything." )
            print()

## Unless existing data is reused, start from an empty directory tree
if not skip_twopuncture:
    if directory_existed:
        shutil.rmtree(File_directory, ignore_errors=True)

    # makedirs (without exist_ok) still fails if the old tree could not be
    # removed, but unlike mkdir it also accepts a nested File_directory
    os.makedirs(File_directory)
    os.makedirs(output_directory)
    os.makedirs(binary_results_directory)
    os.makedirs(figure_directory)
    shutil.copy("AMSS_NCKU_Input.py", File_directory)

    if directory_existed:
        print( " Output directory has been regenerated." )
    else:
        print( " Output directory has been generated." )
    print()

# Ensure figure directory exists (it may be missing when a previous run is reused)
os.makedirs(figure_directory, exist_ok=True)
|
||||
|
||||
##################################################################
|
||||
|
||||
## Output related parameter information
|
||||
|
||||
import setup
|
||||
|
||||
## Print and save input parameter information
|
||||
setup.print_input_data( File_directory )
|
||||
|
||||
if not skip_twopuncture:
|
||||
setup.generate_AMSSNCKU_input()
|
||||
|
||||
setup.print_puncture_information()
|
||||
|
||||
|
||||
##################################################################
|
||||
|
||||
## Everything below is only needed for a fresh run: build the ABE parfile,
## sketch the numerical grid, and write the macro headers. When existing
## TwoPuncture data is reused, the files from the previous run are kept.

if not skip_twopuncture:

    ## --- Generate AMSS-NCKU program input files based on the configured parameters

    print()
    print( " Generating the AMSS-NCKU input parfile for the ABE executable." )
    print()

    ## Generate cgh-related input files from the grid information
    import numerical_grid

    numerical_grid.append_AMSSNCKU_cgh_input()

    print()
    print( " The input parfile for AMSS-NCKU C++ executable file ABE has been generated." )
    print( " However, the input relevant to TwoPuncture need to be appended later." )
    print()

    ## --- Plot the initial grid configuration

    print()
    print( " Schematically plot the numerical grid structure." )
    print()

    numerical_grid.plot_initial_grid()

    ## --- Generate AMSS-NCKU macro files according to the numerical scheme and parameters

    print()
    print( " Automatically generating the macro file for AMSS-NCKU C++ executable file ABE " )
    print( " (Based on the finite-difference numerical scheme) " )
    print()

    import generate_macrodef

    generate_macrodef.generate_macrodef_h()
    print( " AMSS-NCKU macro file macrodef.h has been generated. " )

    generate_macrodef.generate_macrodef_fh()
    print( " AMSS-NCKU macro file macrodef.fh has been generated. " )
|
||||
|
||||
|
||||
##################################################################
|
||||
|
||||
# Compile the AMSS-NCKU program according to user requirements
# NOTE: ABE compilation is always performed, even when skipping TwoPuncture

print()
print( " Preparing to compile and run the AMSS-NCKU code as requested " )
print( " Compiling the AMSS-NCKU code based on the generated macro files " )
print()

AMSS_NCKU_source_path = "AMSS_NCKU_source"
AMSS_NCKU_source_copy = os.path.join(File_directory, "AMSS_NCKU_source_copy")

## If AMSS_NCKU source folder is missing, create it and prompt the user
if not os.path.exists(AMSS_NCKU_source_path):
    os.makedirs(AMSS_NCKU_source_path)
    print( " The AMSS-NCKU source files are incomplete; copy all source files into ./AMSS_NCKU_source. " )
    print( " Press Enter to continue. " )
    inputvalue = input()

# Copy AMSS-NCKU source files to prepare for compilation
# When a previous run is reused its source copy is still there; copytree
# refuses to overwrite an existing directory, so remove it first
if skip_twopuncture and os.path.exists(AMSS_NCKU_source_copy):
    shutil.rmtree(AMSS_NCKU_source_copy)

shutil.copytree(AMSS_NCKU_source_path, AMSS_NCKU_source_copy)

# Copy the macro files into the source copy. They are read from File_directory
# in both modes: freshly generated ones, or those left by the previous run
macrodef_h_path  = os.path.join(File_directory, "macrodef.h")
macrodef_fh_path = os.path.join(File_directory, "macrodef.fh")

shutil.copy2(macrodef_h_path,  AMSS_NCKU_source_copy)
shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)

# Compile related programs
import makefile_and_run

## Remember where the script was launched from. Stepping back with '..' would
## only be correct for a single-level, non-symlinked relative File_directory
launch_directory = os.getcwd()

## Change working directory to the target source copy
os.chdir(AMSS_NCKU_source_copy)

try:
    ## Build the main AMSS-NCKU executable (ABE or ABEGPU)
    makefile_and_run.makefile_ABE()

    ## If the initial-data method is Ansorg-TwoPuncture, build the TwoPunctureABE executable
    ## Only build TwoPunctureABE if not skipping TwoPuncture phase
    if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ) and not skip_twopuncture:
        makefile_and_run.makefile_TwoPunctureABE()
finally:
    ## Always return to the launch directory, even if the build raised
    os.chdir(launch_directory)

print()
|
||||
|
||||
##################################################################
|
||||
|
||||
## Copy the AMSS-NCKU executable (ABE/ABEGPU) to the run directory

if (input_data.GPU_Calculation == "no"):
    ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE")
elif (input_data.GPU_Calculation == "yes"):
    ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABEGPU")
else:
    ## Any other value used to leave ABE_file undefined and fail later with an
    ## unrelated-looking NameError; report the bad setting directly instead
    raise ValueError( f" Unsupported GPU_Calculation value {input_data.GPU_Calculation!r}; expected 'yes' or 'no'. " )

if not os.path.exists( ABE_file ):
    print()
    print( " Lack of AMSS-NCKU executable file ABE/ABEGPU; recompile AMSS_NCKU_source manually. " )
    print( " When recompilation is finished, press Enter to continue. " )
    inputvalue = input()

## Copy the executable ABE (or ABEGPU) into the run directory
shutil.copy2(ABE_file, output_directory)

## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory
## Only copy TwoPunctureABE if not skipping TwoPuncture phase
if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ) and not skip_twopuncture:
    TwoPuncture_file = os.path.join(AMSS_NCKU_source_copy, "TwoPunctureABE")

    if not os.path.exists( TwoPuncture_file ):
        print()
        print( " Lack of AMSS-NCKU executable file TwoPunctureABE; recompile TwoPunctureABE in AMSS_NCKU_source. " )
        print( " When recompilation is finished, press Enter to continue. " )
        inputvalue = input()

    ## Copy the TwoPunctureABE executable into the run directory
    shutil.copy2(TwoPuncture_file, output_directory)
|
||||
|
||||
##################################################################
|
||||
|
||||
## If the initial-data method is TwoPuncture, generate the TwoPuncture input files
## and run TwoPunctureABE. In every branch start_time is set so that the elapsed
## time reported at the end of the script is always defined.

if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ) and not skip_twopuncture:

    print()
    print( " Initial data is chosen as Ansorg-TwoPuncture" )
    print()

    print()
    print( " Automatically generating the input parfile for the TwoPunctureABE executable " )
    print()

    import generate_TwoPuncture_input

    generate_TwoPuncture_input.generate_AMSSNCKU_TwoPuncture_input()

    print()
    print( " The input parfile for the TwoPunctureABE executable has been generated. " )
    print()

    ## Generated AMSS-NCKU TwoPuncture input filename
    AMSS_NCKU_TwoPuncture_inputfile      = 'AMSS-NCKU-TwoPuncture.input'
    AMSS_NCKU_TwoPuncture_inputfile_path = os.path.join( File_directory, AMSS_NCKU_TwoPuncture_inputfile )

    ## Copy and rename the file
    shutil.copy2( AMSS_NCKU_TwoPuncture_inputfile_path, os.path.join(output_directory, 'TwoPunctureinput.par') )

    ## Run TwoPuncture to generate initial-data files

    start_time = time.time()  # Record start time

    print()
    print()

    ## Remember the launch directory. Going back with '..' twice is only right
    ## for a single-level, non-symlinked relative File_directory
    launch_directory = os.getcwd()

    ## Change to the output (run) directory
    os.chdir(output_directory)

    try:
        ## Run the TwoPuncture executable
        import makefile_and_run
        makefile_and_run.run_TwoPunctureABE()
    finally:
        ## Always return to the launch directory, even if the run raised
        os.chdir(launch_directory)

elif (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ) and skip_twopuncture:
    print()
    print( " Skipping TwoPuncture execution, using existing initial data." )
    print()
    start_time = time.time()  # Record start time for ABE only
else:
    start_time = time.time()  # Record start time
|
||||
|
||||
##################################################################
|
||||
|
||||
## Refresh the puncture data from the TwoPuncture results and publish the final
## ABE parfile. When a previous run is reused, its input.par is kept as is.

if skip_twopuncture:
    print()
    print( " Using existing input.par file from previous run." )
    print()
else:
    import renew_puncture_parameter
    renew_puncture_parameter.append_AMSSNCKU_BSSN_input(File_directory, output_directory)

    ## Parfile produced in the run's top directory
    AMSS_NCKU_inputfile      = 'AMSS-NCKU.input'
    AMSS_NCKU_inputfile_path = os.path.join(File_directory, AMSS_NCKU_inputfile)

    ## The executable reads it under the name input.par in the run directory
    parfile_target = os.path.join(output_directory, 'input.par')
    shutil.copy2( AMSS_NCKU_inputfile_path, parfile_target )

    print()
    print( " Successfully copy all AMSS-NCKU input parfile to target dictionary. " )
    print()
|
||||
|
||||
##################################################################
|
||||
|
||||
## Launch the AMSS-NCKU program

print()
print()

## Remember the launch directory. Going back with '..' twice is only right
## for a single-level, non-symlinked relative File_directory
launch_directory = os.getcwd()

## Change to the run directory
os.chdir( output_directory )

try:
    import makefile_and_run
    makefile_and_run.run_ABE()
finally:
    ## Always return to the launch directory, even if the run raised
    os.chdir(launch_directory)

## Wall-clock time since start_time was recorded
end_time     = time.time()
elapsed_time = end_time - start_time
|
||||
|
||||
##################################################################
|
||||
|
||||
## Copy some basic input and log files out to facilitate debugging

## Each entry is (file name inside binary_results_directory,
##                name it gets inside output_directory), in copy order
exported_files = [
    ( "setting.par",         "AMSSNCKU_setting_parameter" ),   # calculation settings
    ( "Error.log",           "Error.log"                  ),   # error log
    ( "bssn_BH.dat",         "bssn_BH.dat"                ),   # primary program outputs
    ( "bssn_ADMQs.dat",      "bssn_ADMQs.dat"             ),
    ( "bssn_psi4.dat",       "bssn_psi4.dat"              ),
    ( "bssn_constraint.dat", "bssn_constraint.dat"        ),
]

## Additional program outputs depend on the evolved equation system
if (input_data.Equation_Class == "BSSN-EM"):
    exported_files.append( ( "bssn_phi1.dat", "bssn_phi1.dat" ) )
    exported_files.append( ( "bssn_phi2.dat", "bssn_phi2.dat" ) )
elif (input_data.Equation_Class == "BSSN-EScalar"):
    exported_files.append( ( "bssn_maxs.dat", "bssn_maxs.dat" ) )

for source_name, target_name in exported_files:
    shutil.copy( os.path.join(binary_results_directory, source_name),
                 os.path.join(output_directory, target_name) )
|
||||
|
||||
##################################################################
|
||||
|
||||
## Plot the AMSS-NCKU program results

print()
print( " Plotting the txt and binary results data from the AMSS-NCKU simulation " )
print()

import plot_xiaoqu
import plot_GW_strain_amplitude_xiaoqu

## Every plotting routine reads from the results directory and writes to the figure directory
plot_dirs = ( binary_results_directory, figure_directory )

## Black hole trajectory (2D and 3D)
plot_xiaoqu.generate_puncture_orbit_plot( *plot_dirs )
plot_xiaoqu.generate_puncture_orbit_plot3D( *plot_dirs )

## Black hole separation vs. time
plot_xiaoqu.generate_puncture_distence_plot( *plot_dirs )

## Gravitational waveforms (psi4 and strain amplitude), one pair per detector
for detector in range(input_data.Detector_Number):
    plot_xiaoqu.generate_gravitational_wave_psi4_plot( *plot_dirs, detector )
    plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( *plot_dirs, detector )

## ADM mass evolution, one plot per detector
for detector in range(input_data.Detector_Number):
    plot_xiaoqu.generate_ADMmass_plot( *plot_dirs, detector )

## Hamiltonian constraint violation over time, one plot per grid level
for level in range(input_data.grid_level):
    plot_xiaoqu.generate_constraint_check_plot( *plot_dirs, level )

## Stored binary data
plot_xiaoqu.generate_binary_data_plot( *plot_dirs )

print()
print( f" This Program Cost = {elapsed_time} Seconds " )
print()
|
||||
|
||||
|
||||
##################################################################
|
||||
|
||||
print()
|
||||
print( " The AMSS-NCKU-Python simulation is successfully finished, thanks for using !!! " )
|
||||
print()
|
||||
|
||||
##################################################################
|
||||
|
||||
|
||||
@@ -8,14 +8,6 @@
|
||||
##
|
||||
##################################################################
|
||||
|
||||
## Guard against re-execution by multiprocessing child processes.
|
||||
## Without this, using 'spawn' or 'forkserver' context would cause every
|
||||
## worker to re-run the entire script, spawning exponentially more
|
||||
## workers (fork bomb).
|
||||
if __name__ != '__main__':
|
||||
import sys as _sys
|
||||
_sys.exit(0)
|
||||
|
||||
|
||||
##################################################################
|
||||
|
||||
@@ -270,12 +262,6 @@ if not os.path.exists( ABE_file ):
|
||||
## Copy the executable ABE (or ABEGPU) into the run directory
|
||||
shutil.copy2(ABE_file, output_directory)
|
||||
|
||||
## Copy interp load balance profile if present (for optimize pass)
|
||||
interp_lb_profile = os.path.join(AMSS_NCKU_source_copy, "interp_lb_profile.bin")
|
||||
if os.path.exists(interp_lb_profile):
|
||||
shutil.copy2(interp_lb_profile, output_directory)
|
||||
print( " Copied interp_lb_profile.bin to run directory " )
|
||||
|
||||
###########################
|
||||
|
||||
## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory
|
||||
@@ -438,31 +424,26 @@ print(
|
||||
|
||||
import plot_xiaoqu
|
||||
import plot_GW_strain_amplitude_xiaoqu
|
||||
from parallel_plot_helper import run_plot_tasks_parallel
|
||||
|
||||
plot_tasks = []
|
||||
|
||||
## Plot black hole trajectory
|
||||
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot, (binary_results_directory, figure_directory) ) )
|
||||
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )
|
||||
plot_xiaoqu.generate_puncture_orbit_plot( binary_results_directory, figure_directory )
|
||||
plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
|
||||
|
||||
## Plot black hole separation vs. time
|
||||
plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )
|
||||
plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
|
||||
|
||||
## Plot gravitational waveforms (psi4 and strain amplitude)
|
||||
for i in range(input_data.Detector_Number):
|
||||
plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
|
||||
plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )
|
||||
plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
|
||||
plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
|
||||
|
||||
## Plot ADM mass evolution
|
||||
for i in range(input_data.Detector_Number):
|
||||
plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )
|
||||
plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
|
||||
|
||||
## Plot Hamiltonian constraint violation over time
|
||||
for i in range(input_data.grid_level):
|
||||
plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )
|
||||
|
||||
run_plot_tasks_parallel(plot_tasks)
|
||||
plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
|
||||
|
||||
## Plot stored binary data
|
||||
plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
|
||||
|
||||
@@ -277,3 +277,4 @@ def main():
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
@@ -37,51 +37,57 @@ close(77)
|
||||
end program checkFFT
|
||||
#endif
|
||||
|
||||
!-------------
|
||||
! Optimized FFT using Intel oneMKL DFTI
|
||||
! Mathematical equivalence: Standard DFT definition
|
||||
! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
|
||||
! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
|
||||
! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
|
||||
!-------------
|
||||
SUBROUTINE four1(dataa,nn,isign)
|
||||
use MKL_DFTI
|
||||
implicit none
|
||||
INTEGER, intent(in) :: isign, nn
|
||||
DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa
|
||||
|
||||
type(DFTI_DESCRIPTOR), pointer :: desc
|
||||
integer :: status
|
||||
|
||||
! Create DFTI descriptor for 1D complex-to-complex transform
|
||||
status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn)
|
||||
if (status /= 0) return
|
||||
|
||||
! Set input/output storage as interleaved complex (default)
|
||||
status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
|
||||
if (status /= 0) then
|
||||
status = DftiFreeDescriptor(desc)
|
||||
return
|
||||
INTEGER::isign,nn
|
||||
double precision,dimension(2*nn)::dataa
|
||||
INTEGER::i,istep,j,m,mmax,n
|
||||
double precision::tempi,tempr
|
||||
DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
|
||||
n=2*nn
|
||||
j=1
|
||||
do i=1,n,2
|
||||
if(j.gt.i)then
|
||||
tempr=dataa(j)
|
||||
tempi=dataa(j+1)
|
||||
dataa(j)=dataa(i)
|
||||
dataa(j+1)=dataa(i+1)
|
||||
dataa(i)=tempr
|
||||
dataa(i+1)=tempi
|
||||
endif
|
||||
m=nn
|
||||
1 if ((m.ge.2).and.(j.gt.m)) then
|
||||
j=j-m
|
||||
m=m/2
|
||||
goto 1
|
||||
endif
|
||||
j=j+m
|
||||
enddo
|
||||
mmax=2
|
||||
2 if (n.gt.mmax) then
|
||||
istep=2*mmax
|
||||
theta=6.28318530717959d0/(isign*mmax)
|
||||
wpr=-2.d0*sin(0.5d0*theta)**2
|
||||
wpi=sin(theta)
|
||||
wr=1.d0
|
||||
wi=0.d0
|
||||
do m=1,mmax,2
|
||||
do i=m,n,istep
|
||||
j=i+mmax
|
||||
tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
|
||||
tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
|
||||
dataa(j)=dataa(i)-tempr
|
||||
dataa(j+1)=dataa(i+1)-tempi
|
||||
dataa(i)=dataa(i)+tempr
|
||||
dataa(i+1)=dataa(i+1)+tempi
|
||||
enddo
|
||||
wtemp=wr
|
||||
wr=wr*wpr-wi*wpi+wr
|
||||
wi=wi*wpr+wtemp*wpi+wi
|
||||
enddo
|
||||
mmax=istep
|
||||
goto 2
|
||||
endif
|
||||
|
||||
! Commit the descriptor
|
||||
status = DftiCommitDescriptor(desc)
|
||||
if (status /= 0) then
|
||||
status = DftiFreeDescriptor(desc)
|
||||
return
|
||||
endif
|
||||
|
||||
! Execute FFT based on direction
|
||||
if (isign == 1) then
|
||||
! Forward FFT: exp(-2*pi*i*k*n/N)
|
||||
status = DftiComputeForward(desc, dataa)
|
||||
else
|
||||
! Backward FFT: exp(+2*pi*i*k*n/N)
|
||||
status = DftiComputeBackward(desc, dataa)
|
||||
endif
|
||||
|
||||
! Free descriptor
|
||||
status = DftiFreeDescriptor(desc)
|
||||
|
||||
return
|
||||
END SUBROUTINE four1
|
||||
|
||||
@@ -13,9 +13,6 @@ using namespace std;
|
||||
#include "MPatch.h"
|
||||
#include "Parallel.h"
|
||||
#include "fmisc.h"
|
||||
#ifdef INTERP_LB_PROFILE
|
||||
#include "interp_lb_profile.h"
|
||||
#endif
|
||||
|
||||
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
|
||||
{
|
||||
@@ -344,9 +341,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
double *Shellf, int Symmetry)
|
||||
{
|
||||
// NOTE: we do not Synchnize variables here, make sure of that before calling this routine
|
||||
int myrank, nprocs;
|
||||
int myrank;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||
|
||||
int ordn = 2 * ghost_width;
|
||||
MyList<var> *varl;
|
||||
@@ -358,18 +354,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
varl = varl->next;
|
||||
}
|
||||
|
||||
memset(Shellf, 0, sizeof(double) * NN * num_var);
|
||||
double *shellf;
|
||||
shellf = new double[NN * num_var];
|
||||
memset(shellf, 0, sizeof(double) * NN * num_var);
|
||||
|
||||
// owner_rank[j] records which MPI rank owns point j
|
||||
// All ranks traverse the same block list so they all agree on ownership
|
||||
int *owner_rank;
|
||||
owner_rank = new int[NN];
|
||||
for (int j = 0; j < NN; j++)
|
||||
owner_rank[j] = -1;
|
||||
// we use weight to monitor code, later some day we can move it for optimization
|
||||
int *weight;
|
||||
weight = new int[NN];
|
||||
memset(weight, 0, sizeof(int) * NN);
|
||||
|
||||
double *DH, *llb, *uub;
|
||||
DH = new double[dim];
|
||||
|
||||
double DH[dim], llb[dim], uub[dim];
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
DH[i] = getdX(i);
|
||||
}
|
||||
llb = new double[dim];
|
||||
uub = new double[dim];
|
||||
|
||||
for (int j = 0; j < NN; j++) // run along points
|
||||
{
|
||||
@@ -401,6 +403,12 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
bool flag = true;
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
// NOTE: our dividing structure is (exclude ghost)
|
||||
// -1 0
|
||||
// 1 2
|
||||
// so (0,1) does not belong to any part for vertex structure
|
||||
// here we put (0,0.5) to left part and (0.5,1) to right part
|
||||
// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
@@ -425,7 +433,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
if (flag)
|
||||
{
|
||||
notfind = false;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
//---> interpolation
|
||||
@@ -433,11 +440,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
int k = 0;
|
||||
while (varl) // run along variables
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
// shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
|
||||
// pox,ordn,varl->data->SoA,Symmetry);
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
weight[j] = 1;
|
||||
}
|
||||
}
|
||||
if (Bp == ble)
|
||||
@@ -446,360 +456,103 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
}
|
||||
}
|
||||
|
||||
// Replace MPI_Allreduce with per-owner MPI_Bcast:
|
||||
// Group consecutive points by owner rank and broadcast each group.
|
||||
// Since each point's data is non-zero only on the owner rank,
|
||||
// Bcast from owner is equivalent to Allreduce(MPI_SUM) but much cheaper.
|
||||
MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
int *Weight;
|
||||
Weight = new int[NN];
|
||||
MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
// misc::tillherecheck("print me");
|
||||
|
||||
for (int i = 0; i < NN; i++)
|
||||
{
|
||||
int j = 0;
|
||||
while (j < NN)
|
||||
if (Weight[i] > 1)
|
||||
{
|
||||
int cur_owner = owner_rank[j];
|
||||
if (cur_owner < 0)
|
||||
{
|
||||
if (myrank == 0)
|
||||
{
|
||||
cout << "ERROR: Patch::Interp_Points fails to find point (";
|
||||
for (int d = 0; d < dim; d++)
|
||||
{
|
||||
cout << XX[d][j];
|
||||
if (d < dim - 1)
|
||||
cout << ",";
|
||||
else
|
||||
cout << ")";
|
||||
}
|
||||
cout << " on Patch (";
|
||||
for (int d = 0; d < dim; d++)
|
||||
{
|
||||
cout << bbox[d] << "+" << lli[d] * DH[d];
|
||||
if (d < dim - 1)
|
||||
cout << ",";
|
||||
else
|
||||
cout << ")--";
|
||||
}
|
||||
cout << "(";
|
||||
for (int d = 0; d < dim; d++)
|
||||
{
|
||||
cout << bbox[dim + d] << "-" << uui[d] * DH[d];
|
||||
if (d < dim - 1)
|
||||
cout << ",";
|
||||
else
|
||||
cout << ")" << endl;
|
||||
}
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
j++;
|
||||
continue;
|
||||
}
|
||||
// Find contiguous run of points with the same owner
|
||||
int jstart = j;
|
||||
while (j < NN && owner_rank[j] == cur_owner)
|
||||
j++;
|
||||
int count = (j - jstart) * num_var;
|
||||
MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
|
||||
if (myrank == 0)
|
||||
cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
|
||||
for (int j = 0; j < num_var; j++)
|
||||
Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
|
||||
}
|
||||
}
|
||||
|
||||
delete[] owner_rank;
|
||||
}
|
||||
void Patch::Interp_Points(MyList<var> *VarList,
|
||||
int NN, double **XX,
|
||||
double *Shellf, int Symmetry,
|
||||
int Nmin_consumer, int Nmax_consumer)
|
||||
{
|
||||
// Targeted point-to-point overload: each owner sends each point only to
|
||||
// the one rank that needs it for integration (consumer), reducing
|
||||
// communication volume by ~nprocs times compared to the Bcast version.
|
||||
#ifdef INTERP_LB_PROFILE
|
||||
double t_interp_start = MPI_Wtime();
|
||||
#endif
|
||||
int myrank, nprocs;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||
|
||||
int ordn = 2 * ghost_width;
|
||||
MyList<var> *varl;
|
||||
int num_var = 0;
|
||||
varl = VarList;
|
||||
while (varl)
|
||||
{
|
||||
num_var++;
|
||||
varl = varl->next;
|
||||
}
|
||||
|
||||
memset(Shellf, 0, sizeof(double) * NN * num_var);
|
||||
|
||||
// owner_rank[j] records which MPI rank owns point j
|
||||
int *owner_rank;
|
||||
owner_rank = new int[NN];
|
||||
for (int j = 0; j < NN; j++)
|
||||
owner_rank[j] = -1;
|
||||
|
||||
double DH[dim], llb[dim], uub[dim];
|
||||
for (int i = 0; i < dim; i++)
|
||||
DH[i] = getdX(i);
|
||||
|
||||
// --- Interpolation phase (identical to original) ---
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
double pox[dim];
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
pox[i] = XX[i][j];
|
||||
if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
|
||||
{
|
||||
cout << "Patch::Interp_Points: point (";
|
||||
for (int k = 0; k < dim; k++)
|
||||
{
|
||||
cout << XX[k][j];
|
||||
if (k < dim - 1)
|
||||
cout << ",";
|
||||
else
|
||||
cout << ") is out of current Patch." << endl;
|
||||
}
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
MyList<Block> *Bp = blb;
|
||||
bool notfind = true;
|
||||
while (notfind && Bp)
|
||||
{
|
||||
Block *BP = Bp->data;
|
||||
|
||||
bool flag = true;
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
#endif
|
||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
|
||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
|
||||
#else
|
||||
#ifdef Cell
|
||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
|
||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
|
||||
#else
|
||||
#error Not define Vertex nor Cell
|
||||
#endif
|
||||
#endif
|
||||
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
|
||||
{
|
||||
flag = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (flag)
|
||||
{
|
||||
notfind = false;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl)
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (Bp == ble)
|
||||
break;
|
||||
Bp = Bp->next;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INTERP_LB_PROFILE
|
||||
double t_interp_end = MPI_Wtime();
|
||||
double t_interp_local = t_interp_end - t_interp_start;
|
||||
#endif
|
||||
|
||||
// --- Error check for unfound points ---
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
if (owner_rank[j] < 0 && myrank == 0)
|
||||
else if (Weight[i] == 0 && myrank == 0)
|
||||
{
|
||||
cout << "ERROR: Patch::Interp_Points fails to find point (";
|
||||
for (int d = 0; d < dim; d++)
|
||||
for (int j = 0; j < dim; j++)
|
||||
{
|
||||
cout << XX[d][j];
|
||||
if (d < dim - 1)
|
||||
cout << XX[j][i];
|
||||
if (j < dim - 1)
|
||||
cout << ",";
|
||||
else
|
||||
cout << ")";
|
||||
}
|
||||
cout << " on Patch (";
|
||||
for (int d = 0; d < dim; d++)
|
||||
for (int j = 0; j < dim; j++)
|
||||
{
|
||||
cout << bbox[d] << "+" << lli[d] * DH[d];
|
||||
if (d < dim - 1)
|
||||
cout << bbox[j] << "+" << lli[j] * getdX(j);
|
||||
if (j < dim - 1)
|
||||
cout << ",";
|
||||
else
|
||||
cout << ")--";
|
||||
}
|
||||
cout << "(";
|
||||
for (int d = 0; d < dim; d++)
|
||||
for (int j = 0; j < dim; j++)
|
||||
{
|
||||
cout << bbox[dim + d] << "-" << uui[d] * DH[d];
|
||||
if (d < dim - 1)
|
||||
cout << bbox[dim + j] << "-" << uui[j] * getdX(j);
|
||||
if (j < dim - 1)
|
||||
cout << ",";
|
||||
else
|
||||
cout << ")" << endl;
|
||||
}
|
||||
#if 0
|
||||
checkBlock();
|
||||
#else
|
||||
cout << "splited domains:" << endl;
|
||||
{
|
||||
MyList<Block> *Bp = blb;
|
||||
while (Bp)
|
||||
{
|
||||
Block *BP = Bp->data;
|
||||
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
#endif
|
||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
|
||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
|
||||
#else
|
||||
#ifdef Cell
|
||||
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
|
||||
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
|
||||
#else
|
||||
#error Not define Vertex nor Cell
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
cout << "(";
|
||||
for (int j = 0; j < dim; j++)
|
||||
{
|
||||
cout << llb[j] << ":" << uub[j];
|
||||
if (j < dim - 1)
|
||||
cout << ",";
|
||||
else
|
||||
cout << ")" << endl;
|
||||
}
|
||||
if (Bp == ble)
|
||||
break;
|
||||
Bp = Bp->next;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
// --- Targeted point-to-point communication phase ---
|
||||
// Compute consumer_rank[j] using the same deterministic formula as surface_integral
|
||||
int *consumer_rank = new int[NN];
|
||||
{
|
||||
int mp = NN / nprocs;
|
||||
int Lp = NN - nprocs * mp;
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
if (j < Lp * (mp + 1))
|
||||
consumer_rank[j] = j / (mp + 1);
|
||||
else
|
||||
consumer_rank[j] = Lp + (j - Lp * (mp + 1)) / mp;
|
||||
}
|
||||
}
|
||||
|
||||
// Count sends and recvs per rank
|
||||
int *send_count = new int[nprocs];
|
||||
int *recv_count = new int[nprocs];
|
||||
memset(send_count, 0, sizeof(int) * nprocs);
|
||||
memset(recv_count, 0, sizeof(int) * nprocs);
|
||||
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
int own = owner_rank[j];
|
||||
int con = consumer_rank[j];
|
||||
if (own == con)
|
||||
continue; // local — no communication needed
|
||||
if (own == myrank)
|
||||
send_count[con]++;
|
||||
if (con == myrank)
|
||||
recv_count[own]++;
|
||||
}
|
||||
|
||||
// Build send buffers: for each destination rank, pack (index, data) pairs
|
||||
// Each entry: 1 double (point index j, stored as a double) + num_var doubles
|
||||
int total_send = 0, total_recv = 0;
|
||||
int *send_offset = new int[nprocs];
|
||||
int *recv_offset = new int[nprocs];
|
||||
for (int r = 0; r < nprocs; r++)
|
||||
{
|
||||
send_offset[r] = total_send;
|
||||
total_send += send_count[r];
|
||||
recv_offset[r] = total_recv;
|
||||
total_recv += recv_count[r];
|
||||
}
|
||||
|
||||
// Pack send buffers: each message contains (j, data[0..num_var-1]) per point
|
||||
int stride = 1 + num_var; // 1 double for index + num_var doubles for data
|
||||
double *sendbuf = new double[total_send * stride];
|
||||
double *recvbuf = new double[total_recv * stride];
|
||||
|
||||
// Temporary counters for packing
|
||||
int *pack_pos = new int[nprocs];
|
||||
memset(pack_pos, 0, sizeof(int) * nprocs);
|
||||
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
int own = owner_rank[j];
|
||||
int con = consumer_rank[j];
|
||||
if (own != myrank || con == myrank)
|
||||
continue;
|
||||
int pos = (send_offset[con] + pack_pos[con]) * stride;
|
||||
sendbuf[pos] = (double)j; // point index
|
||||
for (int v = 0; v < num_var; v++)
|
||||
sendbuf[pos + 1 + v] = Shellf[j * num_var + v];
|
||||
pack_pos[con]++;
|
||||
}
|
||||
|
||||
// Post non-blocking recvs and sends
|
||||
int n_req = 0;
|
||||
for (int r = 0; r < nprocs; r++)
|
||||
{
|
||||
if (recv_count[r] > 0) n_req++;
|
||||
if (send_count[r] > 0) n_req++;
|
||||
}
|
||||
|
||||
MPI_Request *reqs = new MPI_Request[n_req];
|
||||
int req_idx = 0;
|
||||
|
||||
for (int r = 0; r < nprocs; r++)
|
||||
{
|
||||
if (recv_count[r] > 0)
|
||||
{
|
||||
MPI_Irecv(recvbuf + recv_offset[r] * stride,
|
||||
recv_count[r] * stride, MPI_DOUBLE,
|
||||
r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
|
||||
}
|
||||
}
|
||||
for (int r = 0; r < nprocs; r++)
|
||||
{
|
||||
if (send_count[r] > 0)
|
||||
{
|
||||
MPI_Isend(sendbuf + send_offset[r] * stride,
|
||||
send_count[r] * stride, MPI_DOUBLE,
|
||||
r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_req > 0)
|
||||
MPI_Waitall(n_req, reqs, MPI_STATUSES_IGNORE);
|
||||
|
||||
// Unpack recv buffers into Shellf
|
||||
for (int i = 0; i < total_recv; i++)
|
||||
{
|
||||
int pos = i * stride;
|
||||
int j = (int)recvbuf[pos];
|
||||
for (int v = 0; v < num_var; v++)
|
||||
Shellf[j * num_var + v] = recvbuf[pos + 1 + v];
|
||||
}
|
||||
|
||||
delete[] reqs;
|
||||
delete[] sendbuf;
|
||||
delete[] recvbuf;
|
||||
delete[] pack_pos;
|
||||
delete[] send_offset;
|
||||
delete[] recv_offset;
|
||||
delete[] send_count;
|
||||
delete[] recv_count;
|
||||
delete[] consumer_rank;
|
||||
delete[] owner_rank;
|
||||
|
||||
#ifdef INTERP_LB_PROFILE
|
||||
{
|
||||
static bool profile_written = false;
|
||||
if (!profile_written) {
|
||||
double *all_times = nullptr;
|
||||
if (myrank == 0) all_times = new double[nprocs];
|
||||
MPI_Gather(&t_interp_local, 1, MPI_DOUBLE,
|
||||
all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
|
||||
if (myrank == 0) {
|
||||
int heavy[64];
|
||||
int nh = InterpLBProfile::identify_heavy_ranks(
|
||||
all_times, nprocs, 2.5, heavy, 64);
|
||||
InterpLBProfile::write_profile(
|
||||
"interp_lb_profile.bin", nprocs,
|
||||
all_times, heavy, nh, 2.5);
|
||||
printf("[InterpLB] Profile written: %d heavy ranks\n", nh);
|
||||
for (int i = 0; i < nh; i++)
|
||||
printf(" Heavy rank %d: %.6f s\n", heavy[i], all_times[heavy[i]]);
|
||||
delete[] all_times;
|
||||
}
|
||||
profile_written = true;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
delete[] shellf;
|
||||
delete[] weight;
|
||||
delete[] Weight;
|
||||
delete[] DH;
|
||||
delete[] llb;
|
||||
delete[] uub;
|
||||
}
|
||||
void Patch::Interp_Points(MyList<var> *VarList,
|
||||
int NN, double **XX,
|
||||
@@ -820,22 +573,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
varl = varl->next;
|
||||
}
|
||||
|
||||
memset(Shellf, 0, sizeof(double) * NN * num_var);
|
||||
double *shellf;
|
||||
shellf = new double[NN * num_var];
|
||||
memset(shellf, 0, sizeof(double) * NN * num_var);
|
||||
|
||||
// owner_rank[j] stores the global rank that owns point j
|
||||
int *owner_rank;
|
||||
owner_rank = new int[NN];
|
||||
for (int j = 0; j < NN; j++)
|
||||
owner_rank[j] = -1;
|
||||
// we use weight to monitor code, later some day we can move it for optimization
|
||||
int *weight;
|
||||
weight = new int[NN];
|
||||
memset(weight, 0, sizeof(int) * NN);
|
||||
|
||||
// Build global-to-local rank translation for Comm_here
|
||||
MPI_Group world_group, local_group;
|
||||
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
|
||||
MPI_Comm_group(Comm_here, &local_group);
|
||||
double *DH, *llb, *uub;
|
||||
DH = new double[dim];
|
||||
|
||||
double DH[dim], llb[dim], uub[dim];
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
DH[i] = getdX(i);
|
||||
}
|
||||
llb = new double[dim];
|
||||
uub = new double[dim];
|
||||
|
||||
for (int j = 0; j < NN; j++) // run along points
|
||||
{
|
||||
@@ -867,6 +622,12 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
bool flag = true;
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
// NOTE: our dividing structure is (exclude ghost)
|
||||
// -1 0
|
||||
// 1 2
|
||||
// so (0,1) does not belong to any part for vertex structure
|
||||
// here we put (0,0.5) to left part and (0.5,1) to right part
|
||||
// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
@@ -891,7 +652,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
if (flag)
|
||||
{
|
||||
notfind = false;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
//---> interpolation
|
||||
@@ -899,11 +659,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
int k = 0;
|
||||
while (varl) // run along variables
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
// shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
|
||||
// pox,ordn,varl->data->SoA,Symmetry);
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
weight[j] = 1;
|
||||
}
|
||||
}
|
||||
if (Bp == ble)
|
||||
@@ -912,35 +675,97 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
}
|
||||
}
|
||||
|
||||
// Collect unique global owner ranks and translate to local ranks in Comm_here
|
||||
// Then broadcast each owner's points via MPI_Bcast on Comm_here
|
||||
{
|
||||
int j = 0;
|
||||
while (j < NN)
|
||||
{
|
||||
int cur_owner_global = owner_rank[j];
|
||||
if (cur_owner_global < 0)
|
||||
{
|
||||
// Point not found — skip (error check disabled for sub-communicator levels)
|
||||
j++;
|
||||
continue;
|
||||
}
|
||||
// Translate global rank to local rank in Comm_here
|
||||
int cur_owner_local;
|
||||
MPI_Group_translate_ranks(world_group, 1, &cur_owner_global, local_group, &cur_owner_local);
|
||||
MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
int *Weight;
|
||||
Weight = new int[NN];
|
||||
MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, Comm_here);
|
||||
|
||||
// Find contiguous run of points with the same owner
|
||||
int jstart = j;
|
||||
while (j < NN && owner_rank[j] == cur_owner_global)
|
||||
j++;
|
||||
int count = (j - jstart) * num_var;
|
||||
MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner_local, Comm_here);
|
||||
// misc::tillherecheck("print me");
|
||||
// if(lmyrank == 0) cout<<"myrank = "<<myrank<<"print me"<<endl;
|
||||
|
||||
for (int i = 0; i < NN; i++)
|
||||
{
|
||||
if (Weight[i] > 1)
|
||||
{
|
||||
if (lmyrank == 0)
|
||||
cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
|
||||
for (int j = 0; j < num_var; j++)
|
||||
Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
|
||||
}
|
||||
#if 0 // for not involved levels, this may fail
|
||||
else if(Weight[i] == 0 && lmyrank == 0)
|
||||
{
|
||||
cout<<"ERROR: Patch::Interp_Points fails to find point (";
|
||||
for(int j=0;j<dim;j++)
|
||||
{
|
||||
cout<<XX[j][i];
|
||||
if(j<dim-1) cout<<",";
|
||||
else cout<<")";
|
||||
}
|
||||
cout<<" on Patch (";
|
||||
for(int j=0;j<dim;j++)
|
||||
{
|
||||
cout<<bbox[j]<<"+"<<lli[j]*getdX(j);
|
||||
if(j<dim-1) cout<<",";
|
||||
else cout<<")--";
|
||||
}
|
||||
cout<<"(";
|
||||
for(int j=0;j<dim;j++)
|
||||
{
|
||||
cout<<bbox[dim+j]<<"-"<<uui[j]*getdX(j);
|
||||
if(j<dim-1) cout<<",";
|
||||
else cout<<")"<<endl;
|
||||
}
|
||||
#if 0
|
||||
checkBlock();
|
||||
#else
|
||||
cout<<"splited domains:"<<endl;
|
||||
{
|
||||
MyList<Block> *Bp=blb;
|
||||
while(Bp)
|
||||
{
|
||||
Block *BP=Bp->data;
|
||||
|
||||
for(int i=0;i<dim;i++)
|
||||
{
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
#endif
|
||||
llb[i] = (feq(BP->bbox[i] ,bbox[i] ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i] : BP->bbox[i] +(ghost_width-0.5)*DH[i];
|
||||
uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-(ghost_width-0.5)*DH[i];
|
||||
#else
|
||||
#ifdef Cell
|
||||
llb[i] = (feq(BP->bbox[i] ,bbox[i] ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i] : BP->bbox[i] +ghost_width*DH[i];
|
||||
uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-ghost_width*DH[i];
|
||||
#else
|
||||
#error Not define Vertex nor Cell
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
cout<<"(";
|
||||
for(int j=0;j<dim;j++)
|
||||
{
|
||||
cout<<llb[j]<<":"<<uub[j];
|
||||
if(j<dim-1) cout<<",";
|
||||
else cout<<")"<<endl;
|
||||
}
|
||||
if(Bp == ble) break;
|
||||
Bp=Bp->next;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
MPI_Abort(MPI_COMM_WORLD,1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
MPI_Group_free(&world_group);
|
||||
MPI_Group_free(&local_group);
|
||||
delete[] owner_rank;
|
||||
delete[] shellf;
|
||||
delete[] weight;
|
||||
delete[] Weight;
|
||||
delete[] DH;
|
||||
delete[] llb;
|
||||
delete[] uub;
|
||||
}
|
||||
void Patch::checkBlock()
|
||||
{
|
||||
|
||||
@@ -39,10 +39,6 @@ public:
|
||||
|
||||
bool Find_Point(double *XX);
|
||||
|
||||
void Interp_Points(MyList<var> *VarList,
|
||||
int NN, double **XX,
|
||||
double *Shellf, int Symmetry,
|
||||
int Nmin_consumer, int Nmax_consumer);
|
||||
void Interp_Points(MyList<var> *VarList,
|
||||
int NN, double **XX,
|
||||
double *Shellf, int Symmetry, MPI_Comm Comm_here);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -32,16 +32,6 @@ namespace Parallel
|
||||
int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 dimensions
|
||||
int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
|
||||
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
|
||||
MyList<Block> *distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0);
|
||||
Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
|
||||
int ib0_orig, int ib3_orig,
|
||||
int jb1_orig, int jb4_orig,
|
||||
int kb2_orig, int kb5_orig,
|
||||
Patch* PP, int r_left, int r_right,
|
||||
int ingfsi, int fngfsi, bool periodic,
|
||||
Block* &split_first_block, Block* &split_last_block);
|
||||
Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
|
||||
int block_id, int ingfsi, int fngfsi, int lev);
|
||||
void KillBlocks(MyList<Patch> *PatchLIST);
|
||||
|
||||
void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
|
||||
@@ -91,43 +81,6 @@ namespace Parallel
|
||||
int Symmetry);
|
||||
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
|
||||
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
|
||||
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
|
||||
|
||||
struct SyncCache {
|
||||
bool valid;
|
||||
int cpusize;
|
||||
MyList<gridseg> **combined_src;
|
||||
MyList<gridseg> **combined_dst;
|
||||
int *send_lengths;
|
||||
int *recv_lengths;
|
||||
double **send_bufs;
|
||||
double **recv_bufs;
|
||||
int *send_buf_caps;
|
||||
int *recv_buf_caps;
|
||||
MPI_Request *reqs;
|
||||
MPI_Status *stats;
|
||||
int max_reqs;
|
||||
bool lengths_valid;
|
||||
SyncCache();
|
||||
void invalidate();
|
||||
void destroy();
|
||||
};
|
||||
|
||||
void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
|
||||
void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
|
||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||
int Symmetry, SyncCache &cache);
|
||||
|
||||
struct AsyncSyncState {
|
||||
int req_no;
|
||||
bool active;
|
||||
AsyncSyncState() : req_no(0), active(false) {}
|
||||
};
|
||||
|
||||
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
|
||||
SyncCache &cache, AsyncSyncState &state);
|
||||
void Sync_finish(SyncCache &cache, AsyncSyncState &state,
|
||||
MyList<var> *VarList, int Symmetry);
|
||||
void OutBdLow2Hi(Patch *Patc, Patch *Patf,
|
||||
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
|
||||
int Symmetry);
|
||||
@@ -140,15 +93,6 @@ namespace Parallel
|
||||
void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
|
||||
int Symmetry);
|
||||
void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||
int Symmetry, SyncCache &cache);
|
||||
void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||
int Symmetry, SyncCache &cache);
|
||||
void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||
int Symmetry, SyncCache &cache);
|
||||
void Prolong(Patch *Patc, Patch *Patf,
|
||||
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
|
||||
int Symmetry);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,8 +1,7 @@
|
||||
|
||||
#ifndef TWO_PUNCTURES_H
|
||||
#define TWO_PUNCTURES_H
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#define StencilSize 19
|
||||
#define N_PlaneRelax 1
|
||||
#define NRELAX 200
|
||||
@@ -33,7 +32,7 @@ private:
|
||||
int npoints_A, npoints_B, npoints_phi;
|
||||
|
||||
double target_M_plus, target_M_minus;
|
||||
|
||||
|
||||
double admMass;
|
||||
|
||||
double adm_tol;
|
||||
@@ -43,18 +42,6 @@ private:
|
||||
|
||||
int ntotal;
|
||||
|
||||
// ===== Precomputed spectral derivative matrices =====
|
||||
double *D1_A, *D2_A;
|
||||
double *D1_B, *D2_B;
|
||||
double *DF1_phi, *DF2_phi;
|
||||
|
||||
// ===== Pre-allocated workspace for LineRelax (per-thread) =====
|
||||
int max_threads;
|
||||
double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be;
|
||||
double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be;
|
||||
double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al;
|
||||
double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al;
|
||||
|
||||
struct parameters
|
||||
{
|
||||
int nvar, n1, n2, n3;
|
||||
@@ -71,28 +58,6 @@ public:
|
||||
int Newtonmaxit);
|
||||
~TwoPunctures();
|
||||
|
||||
// 02/07: New/modified methods
|
||||
void allocate_workspace();
|
||||
void free_workspace();
|
||||
void precompute_derivative_matrices();
|
||||
void build_cheb_deriv_matrices(int n, double *D1, double *D2);
|
||||
void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
|
||||
void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
|
||||
void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
|
||||
double *l, double *u_ws, double *d, double *y);
|
||||
void LineRelax_be_omp(double *dv,
|
||||
int const i, int const k, int const nvar,
|
||||
int const n1, int const n2, int const n3,
|
||||
double const *rhs, int const *ncols, int **cols,
|
||||
double **JFD, int tid);
|
||||
void LineRelax_al_omp(double *dv,
|
||||
int const j, int const k, int const nvar,
|
||||
int const n1, int const n2, int const n3,
|
||||
double const *rhs, int const *ncols,
|
||||
int **cols, double **JFD, int tid);
|
||||
void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
|
||||
double const *rhs, int const *ncols, int **cols, double **JFD);
|
||||
|
||||
void Solve();
|
||||
void set_initial_guess(derivs v);
|
||||
int index(int i, int j, int k, int l, int a, int b, int c, int d);
|
||||
@@ -151,11 +116,23 @@ public:
|
||||
double BY_KKofxyz(double x, double y, double z);
|
||||
void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
|
||||
void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
|
||||
void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
|
||||
double const *rhs, int const *ncols, int **cols, double **JFD);
|
||||
void LineRelax_be(double *dv,
|
||||
int const i, int const k, int const nvar,
|
||||
int const n1, int const n2, int const n3,
|
||||
double const *rhs, int const *ncols, int **cols,
|
||||
double **JFD);
|
||||
void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
|
||||
int n3, derivs dv, derivs u, double *values);
|
||||
void LinEquations(double A, double B, double X, double R,
|
||||
double x, double r, double phi,
|
||||
double y, double z, derivs dU, derivs U, double *values);
|
||||
void LineRelax_al(double *dv,
|
||||
int const j, int const k, int const nvar,
|
||||
int const n1, int const n2, int const n3,
|
||||
double const *rhs, int const *ncols,
|
||||
int **cols, double **JFD);
|
||||
void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
|
||||
void Save(char *fname);
|
||||
// provided by Vasileios Paschalidis (vpaschal@illinois.edu)
|
||||
@@ -164,4 +141,4 @@ public:
|
||||
void SpecCoef(parameters par, int ivar, double *v, double *cf);
|
||||
};
|
||||
|
||||
#endif /* TWO_PUNCTURES_H */
|
||||
#endif /* TWO_PUNCTURES_H */
|
||||
|
||||
@@ -730,12 +730,6 @@ void bssn_class::Initialize()
|
||||
PhysTime = StartTime;
|
||||
Setup_Black_Hole_position();
|
||||
}
|
||||
|
||||
// Initialize sync caches (per-level, for predictor and corrector)
|
||||
sync_cache_pre = new Parallel::SyncCache[GH->levels];
|
||||
sync_cache_cor = new Parallel::SyncCache[GH->levels];
|
||||
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
|
||||
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
|
||||
}
|
||||
|
||||
//================================================================================================
|
||||
@@ -987,32 +981,6 @@ bssn_class::~bssn_class()
|
||||
delete Azzz;
|
||||
#endif
|
||||
|
||||
// Destroy sync caches before GH
|
||||
if (sync_cache_pre)
|
||||
{
|
||||
for (int i = 0; i < GH->levels; i++)
|
||||
sync_cache_pre[i].destroy();
|
||||
delete[] sync_cache_pre;
|
||||
}
|
||||
if (sync_cache_cor)
|
||||
{
|
||||
for (int i = 0; i < GH->levels; i++)
|
||||
sync_cache_cor[i].destroy();
|
||||
delete[] sync_cache_cor;
|
||||
}
|
||||
if (sync_cache_rp_coarse)
|
||||
{
|
||||
for (int i = 0; i < GH->levels; i++)
|
||||
sync_cache_rp_coarse[i].destroy();
|
||||
delete[] sync_cache_rp_coarse;
|
||||
}
|
||||
if (sync_cache_rp_fine)
|
||||
{
|
||||
for (int i = 0; i < GH->levels; i++)
|
||||
sync_cache_rp_fine[i].destroy();
|
||||
delete[] sync_cache_rp_fine;
|
||||
}
|
||||
|
||||
delete GH;
|
||||
#ifdef WithShell
|
||||
delete SH;
|
||||
@@ -2213,7 +2181,6 @@ void bssn_class::Evolve(int Steps)
|
||||
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
|
||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
|
||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||
#endif
|
||||
|
||||
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
|
||||
@@ -2426,10 +2393,9 @@ void bssn_class::RecursiveStep(int lev)
|
||||
#endif
|
||||
|
||||
#if (REGLEV == 0)
|
||||
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -2605,10 +2571,9 @@ void bssn_class::ParallelStep()
|
||||
delete[] tporg;
|
||||
delete[] tporgo;
|
||||
#if (REGLEV == 0)
|
||||
if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
|
||||
GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
|
||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -2772,10 +2737,9 @@ void bssn_class::ParallelStep()
|
||||
if (lev + 1 >= GH->movls)
|
||||
{
|
||||
// GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0,
|
||||
if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||
GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
|
||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
|
||||
|
||||
// a_stream.clear();
|
||||
// a_stream.str("");
|
||||
@@ -2787,10 +2751,9 @@ void bssn_class::ParallelStep()
|
||||
// for this level
|
||||
if (YN == 1)
|
||||
{
|
||||
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
|
||||
|
||||
// a_stream.clear();
|
||||
// a_stream.str("");
|
||||
@@ -2806,10 +2769,9 @@ void bssn_class::ParallelStep()
|
||||
if (YN == 1)
|
||||
{
|
||||
// GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
|
||||
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
|
||||
|
||||
// a_stream.clear();
|
||||
// a_stream.str("");
|
||||
@@ -2822,10 +2784,9 @@ void bssn_class::ParallelStep()
|
||||
if (i % 4 == 3)
|
||||
{
|
||||
// GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
|
||||
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
|
||||
|
||||
// a_stream.clear();
|
||||
// a_stream.str("");
|
||||
@@ -3197,7 +3158,21 @@ void bssn_class::Step(int lev, int YN)
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
if (ERROR)
|
||||
{
|
||||
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WithShell
|
||||
// evolve Shell Patches
|
||||
@@ -3215,9 +3190,9 @@ void bssn_class::Step(int lev, int YN)
|
||||
{
|
||||
#if (AGM == 0)
|
||||
f_enforce_ga(cg->shape,
|
||||
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
||||
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
||||
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
|
||||
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
|
||||
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
|
||||
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
|
||||
#endif
|
||||
|
||||
@@ -3341,16 +3316,25 @@ void bssn_class::Step(int lev, int YN)
|
||||
#endif
|
||||
}
|
||||
|
||||
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
|
||||
MPI_Request err_req;
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
if (ERROR)
|
||||
{
|
||||
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
Parallel::AsyncSyncState async_pre;
|
||||
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
|
||||
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
|
||||
|
||||
#ifdef WithShell
|
||||
if (lev == 0)
|
||||
@@ -3363,29 +3347,12 @@ void bssn_class::Step(int lev, int YN)
|
||||
{
|
||||
prev_clock = curr_clock;
|
||||
curr_clock = clock();
|
||||
cout << " Shell stuff synchronization used "
|
||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||
cout << " Shell stuff synchronization used "
|
||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||
<< " seconds! " << endl;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
|
||||
|
||||
#ifdef WithShell
|
||||
// Complete non-blocking error reduction and check
|
||||
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
|
||||
if (ERROR)
|
||||
{
|
||||
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
|
||||
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (MAPBH == 0)
|
||||
// for black hole position
|
||||
@@ -3561,7 +3528,24 @@ void bssn_class::Step(int lev, int YN)
|
||||
Pp = Pp->next;
|
||||
}
|
||||
|
||||
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
if (ERROR)
|
||||
{
|
||||
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
|
||||
<< " variables at t = " << PhysTime
|
||||
<< ", lev = " << lev << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WithShell
|
||||
// evolve Shell Patches
|
||||
@@ -3579,9 +3563,9 @@ void bssn_class::Step(int lev, int YN)
|
||||
{
|
||||
#if (AGM == 0)
|
||||
f_enforce_ga(cg->shape,
|
||||
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
|
||||
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
|
||||
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
|
||||
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
|
||||
#elif (AGM == 1)
|
||||
if (iter_count == 3)
|
||||
@@ -3701,16 +3685,26 @@ void bssn_class::Step(int lev, int YN)
|
||||
sPp = sPp->next;
|
||||
}
|
||||
}
|
||||
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
|
||||
MPI_Request err_req_cor;
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
if (ERROR)
|
||||
{
|
||||
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#"
|
||||
<< iter_count << " variables at t = "
|
||||
<< PhysTime << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
Parallel::AsyncSyncState async_cor;
|
||||
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
|
||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
||||
|
||||
#ifdef WithShell
|
||||
if (lev == 0)
|
||||
@@ -3723,31 +3717,12 @@ void bssn_class::Step(int lev, int YN)
|
||||
{
|
||||
prev_clock = curr_clock;
|
||||
curr_clock = clock();
|
||||
cout << " Shell stuff synchronization used "
|
||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||
cout << " Shell stuff synchronization used "
|
||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||
<< " seconds! " << endl;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
|
||||
|
||||
#ifdef WithShell
|
||||
// Complete non-blocking error reduction and check
|
||||
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
|
||||
if (ERROR)
|
||||
{
|
||||
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
|
||||
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
|
||||
<< " variables at t = " << PhysTime
|
||||
<< ", lev = " << lev << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (MAPBH == 0)
|
||||
// for black hole position
|
||||
@@ -4059,7 +4034,22 @@ void bssn_class::Step(int lev, int YN)
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
if (ERROR)
|
||||
{
|
||||
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
|
||||
<< ", lev = " << lev << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WithShell
|
||||
// evolve Shell Patches
|
||||
@@ -4077,15 +4067,15 @@ void bssn_class::Step(int lev, int YN)
|
||||
{
|
||||
#if (AGM == 0)
|
||||
f_enforce_ga(cg->shape,
|
||||
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
||||
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
||||
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
|
||||
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
|
||||
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
|
||||
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
|
||||
#endif
|
||||
|
||||
if (f_compute_rhs_bssn_ss(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
|
||||
cg->fgfs[fngfs + ShellPatch::gx],
|
||||
cg->fgfs[fngfs + ShellPatch::gy],
|
||||
cg->fgfs[fngfs + ShellPatch::gx],
|
||||
cg->fgfs[fngfs + ShellPatch::gy],
|
||||
cg->fgfs[fngfs + ShellPatch::gz],
|
||||
cg->fgfs[fngfs + ShellPatch::drhodx],
|
||||
cg->fgfs[fngfs + ShellPatch::drhody],
|
||||
@@ -4200,16 +4190,25 @@ void bssn_class::Step(int lev, int YN)
|
||||
}
|
||||
#endif
|
||||
}
|
||||
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
|
||||
MPI_Request err_req;
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
if (ERROR)
|
||||
{
|
||||
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = "
|
||||
<< PhysTime << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
Parallel::AsyncSyncState async_pre;
|
||||
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
|
||||
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
|
||||
|
||||
#ifdef WithShell
|
||||
if (lev == 0)
|
||||
@@ -4222,27 +4221,9 @@ void bssn_class::Step(int lev, int YN)
|
||||
{
|
||||
prev_clock = curr_clock;
|
||||
curr_clock = clock();
|
||||
cout << " Shell stuff synchronization used "
|
||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||
<< " seconds! " << endl;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
|
||||
|
||||
#ifdef WithShell
|
||||
// Complete non-blocking error reduction and check
|
||||
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
|
||||
if (ERROR)
|
||||
{
|
||||
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
|
||||
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
|
||||
<< ", lev = " << lev << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
cout << " Shell stuff synchronization used "
|
||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||
<< " seconds! " << endl;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -4405,7 +4386,23 @@ void bssn_class::Step(int lev, int YN)
|
||||
Pp = Pp->next;
|
||||
}
|
||||
|
||||
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
if (ERROR)
|
||||
{
|
||||
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
|
||||
<< " variables at t = " << PhysTime
|
||||
<< ", lev = " << lev << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WithShell
|
||||
// evolve Shell Patches
|
||||
@@ -4423,9 +4420,9 @@ void bssn_class::Step(int lev, int YN)
|
||||
{
|
||||
#if (AGM == 0)
|
||||
f_enforce_ga(cg->shape,
|
||||
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
|
||||
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
|
||||
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
|
||||
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
|
||||
#elif (AGM == 1)
|
||||
if (iter_count == 3)
|
||||
@@ -4545,16 +4542,25 @@ void bssn_class::Step(int lev, int YN)
|
||||
sPp = sPp->next;
|
||||
}
|
||||
}
|
||||
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
|
||||
MPI_Request err_req_cor;
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
if (ERROR)
|
||||
{
|
||||
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
|
||||
<< " variables at t = " << PhysTime << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
Parallel::AsyncSyncState async_cor;
|
||||
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
|
||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
||||
|
||||
#ifdef WithShell
|
||||
if (lev == 0)
|
||||
@@ -4567,30 +4573,11 @@ void bssn_class::Step(int lev, int YN)
|
||||
{
|
||||
prev_clock = curr_clock;
|
||||
curr_clock = clock();
|
||||
cout << " Shell stuff synchronization used "
|
||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||
cout << " Shell stuff synchronization used "
|
||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||
<< " seconds! " << endl;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
|
||||
|
||||
#ifdef WithShell
|
||||
// Complete non-blocking error reduction and check
|
||||
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
|
||||
if (ERROR)
|
||||
{
|
||||
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
|
||||
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
|
||||
<< " variables at t = " << PhysTime
|
||||
<< ", lev = " << lev << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// for black hole position
|
||||
if (BH_num > 0 && lev == GH->levels - 1)
|
||||
@@ -4956,19 +4943,11 @@ void bssn_class::Step(int lev, int YN)
|
||||
|
||||
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation");
|
||||
|
||||
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
|
||||
MPI_Request err_req;
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req);
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
|
||||
}
|
||||
|
||||
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
|
||||
|
||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
|
||||
|
||||
// Complete non-blocking error reduction and check
|
||||
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
|
||||
if (ERROR)
|
||||
{
|
||||
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
|
||||
@@ -4980,6 +4959,10 @@ void bssn_class::Step(int lev, int YN)
|
||||
}
|
||||
}
|
||||
|
||||
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
|
||||
|
||||
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
|
||||
|
||||
#if (MAPBH == 0)
|
||||
// for black hole position
|
||||
if (BH_num > 0 && lev == GH->levels - 1)
|
||||
@@ -5157,34 +5140,30 @@ void bssn_class::Step(int lev, int YN)
|
||||
|
||||
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check");
|
||||
|
||||
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
|
||||
MPI_Request err_req_cor;
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req_cor);
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
|
||||
}
|
||||
|
||||
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
|
||||
|
||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
|
||||
|
||||
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
|
||||
|
||||
// Complete non-blocking error reduction and check
|
||||
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
|
||||
if (ERROR)
|
||||
{
|
||||
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
|
||||
<< " variables at t = " << PhysTime
|
||||
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
|
||||
<< " variables at t = " << PhysTime
|
||||
<< ", lev = " << lev << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
|
||||
|
||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
||||
|
||||
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
|
||||
|
||||
#if (MAPBH == 0)
|
||||
// for black hole position
|
||||
if (BH_num > 0 && lev == GH->levels - 1)
|
||||
@@ -5468,11 +5447,21 @@ void bssn_class::SHStep()
|
||||
#if (PSTR == 1 || PSTR == 2)
|
||||
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check");
|
||||
#endif
|
||||
// Non-blocking error reduction overlapped with Synch to hide Allreduce latency
|
||||
MPI_Request err_req;
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
if (ERROR)
|
||||
{
|
||||
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
@@ -5484,25 +5473,12 @@ void bssn_class::SHStep()
|
||||
{
|
||||
prev_clock = curr_clock;
|
||||
curr_clock = clock();
|
||||
cout << " Shell stuff synchronization used "
|
||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||
cout << " Shell stuff synchronization used "
|
||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||
<< " seconds! " << endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Complete non-blocking error reduction and check
|
||||
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
|
||||
if (ERROR)
|
||||
{
|
||||
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
// corrector
|
||||
for (iter_count = 1; iter_count < 4; iter_count++)
|
||||
{
|
||||
@@ -5645,11 +5621,21 @@ void bssn_class::SHStep()
|
||||
sPp = sPp->next;
|
||||
}
|
||||
}
|
||||
// Non-blocking error reduction overlapped with Synch to hide Allreduce latency
|
||||
MPI_Request err_req_cor;
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
if (ERROR)
|
||||
{
|
||||
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
|
||||
<< " variables at t = " << PhysTime << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
@@ -5661,26 +5647,12 @@ void bssn_class::SHStep()
|
||||
{
|
||||
prev_clock = curr_clock;
|
||||
curr_clock = clock();
|
||||
cout << " Shell stuff synchronization used "
|
||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||
cout << " Shell stuff synchronization used "
|
||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||
<< " seconds! " << endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Complete non-blocking error reduction and check
|
||||
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
|
||||
if (ERROR)
|
||||
{
|
||||
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
|
||||
if (myrank == 0)
|
||||
{
|
||||
if (ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
|
||||
<< " variables at t = " << PhysTime << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
sPp = SH->PatL;
|
||||
while (sPp)
|
||||
{
|
||||
@@ -5809,7 +5781,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
|
||||
#endif
|
||||
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
|
||||
|
||||
#if (PSTR == 1 || PSTR == 2)
|
||||
// a_stream.clear();
|
||||
@@ -5819,11 +5791,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
#endif
|
||||
|
||||
#if (RPB == 0)
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
while (Ppc)
|
||||
{
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
#if (MIXOUTB == 0)
|
||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
|
||||
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
|
||||
#elif (MIXOUTB == 1)
|
||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
|
||||
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
|
||||
#endif
|
||||
Pp = Pp->next;
|
||||
}
|
||||
Ppc = Ppc->next;
|
||||
}
|
||||
#elif (RPB == 1)
|
||||
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
|
||||
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
|
||||
@@ -5860,7 +5842,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
|
||||
#endif
|
||||
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
|
||||
|
||||
#if (PSTR == 1 || PSTR == 2)
|
||||
// a_stream.clear();
|
||||
@@ -5870,11 +5852,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
#endif
|
||||
|
||||
#if (RPB == 0)
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
while (Ppc)
|
||||
{
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
#if (MIXOUTB == 0)
|
||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
||||
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
|
||||
#elif (MIXOUTB == 1)
|
||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
||||
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
|
||||
#endif
|
||||
Pp = Pp->next;
|
||||
}
|
||||
Ppc = Ppc->next;
|
||||
}
|
||||
#elif (RPB == 1)
|
||||
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
|
||||
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
|
||||
@@ -5888,7 +5880,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
||||
#endif
|
||||
}
|
||||
|
||||
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
|
||||
Parallel::Sync(GH->PatL[lev], SL, Symmetry);
|
||||
|
||||
#if (PSTR == 1 || PSTR == 2)
|
||||
// a_stream.clear();
|
||||
@@ -5946,14 +5938,24 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
|
||||
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
|
||||
#endif
|
||||
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
|
||||
|
||||
#if (RPB == 0)
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
while (Ppc)
|
||||
{
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
#if (MIXOUTB == 0)
|
||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
|
||||
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
|
||||
#elif (MIXOUTB == 1)
|
||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
|
||||
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
|
||||
#endif
|
||||
Pp = Pp->next;
|
||||
}
|
||||
Ppc = Ppc->next;
|
||||
}
|
||||
#elif (RPB == 1)
|
||||
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
|
||||
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
|
||||
@@ -5968,21 +5970,31 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
|
||||
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
|
||||
#endif
|
||||
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
|
||||
|
||||
#if (RPB == 0)
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
while (Ppc)
|
||||
{
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
#if (MIXOUTB == 0)
|
||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
||||
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
|
||||
#elif (MIXOUTB == 1)
|
||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
|
||||
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
|
||||
#endif
|
||||
Pp = Pp->next;
|
||||
}
|
||||
Ppc = Ppc->next;
|
||||
}
|
||||
#elif (RPB == 1)
|
||||
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
|
||||
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
|
||||
#endif
|
||||
}
|
||||
|
||||
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
|
||||
Parallel::Sync(GH->PatL[lev], SL, Symmetry);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6033,14 +6045,24 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
||||
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
|
||||
#endif
|
||||
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
|
||||
|
||||
#if (RPB == 0)
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
while (Ppc)
|
||||
{
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
#if (MIXOUTB == 0)
|
||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
|
||||
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
|
||||
#elif (MIXOUTB == 1)
|
||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
|
||||
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
|
||||
#endif
|
||||
Pp = Pp->next;
|
||||
}
|
||||
Ppc = Ppc->next;
|
||||
}
|
||||
#elif (RPB == 1)
|
||||
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
|
||||
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
|
||||
@@ -6057,21 +6079,31 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
||||
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
|
||||
#endif
|
||||
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
|
||||
|
||||
#if (RPB == 0)
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
while (Ppc)
|
||||
{
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
#if (MIXOUTB == 0)
|
||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
|
||||
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
|
||||
#elif (MIXOUTB == 1)
|
||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
|
||||
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
|
||||
#endif
|
||||
Pp = Pp->next;
|
||||
}
|
||||
Ppc = Ppc->next;
|
||||
}
|
||||
#elif (RPB == 1)
|
||||
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
|
||||
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
|
||||
#endif
|
||||
}
|
||||
|
||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
|
||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6101,11 +6133,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
|
||||
}
|
||||
|
||||
#if (RPB == 0)
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
while (Ppc)
|
||||
{
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
#if (MIXOUTB == 0)
|
||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
|
||||
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
|
||||
#elif (MIXOUTB == 1)
|
||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
|
||||
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
|
||||
#endif
|
||||
Pp = Pp->next;
|
||||
}
|
||||
Ppc = Ppc->next;
|
||||
}
|
||||
#elif (RPB == 1)
|
||||
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
|
||||
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
|
||||
@@ -6114,11 +6156,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
|
||||
else // no time refinement levels and for all same time levels
|
||||
{
|
||||
#if (RPB == 0)
|
||||
Ppc = GH->PatL[lev - 1];
|
||||
while (Ppc)
|
||||
{
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
#if (MIXOUTB == 0)
|
||||
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
|
||||
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
|
||||
#elif (MIXOUTB == 1)
|
||||
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
|
||||
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
|
||||
#endif
|
||||
Pp = Pp->next;
|
||||
}
|
||||
Ppc = Ppc->next;
|
||||
}
|
||||
#elif (RPB == 1)
|
||||
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
|
||||
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
|
||||
@@ -6134,10 +6186,10 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
|
||||
#else
|
||||
Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
|
||||
#endif
|
||||
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
|
||||
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
|
||||
}
|
||||
|
||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
|
||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
||||
}
|
||||
}
|
||||
#undef MIXOUTB
|
||||
|
||||
@@ -126,11 +126,6 @@ public:
|
||||
MyList<var> *OldStateList, *DumpList;
|
||||
MyList<var> *ConstraintList;
|
||||
|
||||
Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync
|
||||
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
|
||||
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
|
||||
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
|
||||
|
||||
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
|
||||
monitor *ConVMonitor;
|
||||
surface_integral *Waveshell;
|
||||
|
||||
@@ -106,8 +106,7 @@
|
||||
call getpbh(BHN,Porg,Mass)
|
||||
#endif
|
||||
|
||||
!!! sanity check (disabled in production builds for performance)
|
||||
#ifdef DEBUG
|
||||
!!! sanity check
|
||||
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
|
||||
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
|
||||
+sum(Gamx)+sum(Gamy)+sum(Gamz) &
|
||||
@@ -137,7 +136,6 @@
|
||||
gont = 1
|
||||
return
|
||||
endif
|
||||
#endif
|
||||
|
||||
PI = dacos(-ONE)
|
||||
|
||||
@@ -945,60 +943,103 @@
|
||||
SSA(2)=SYM
|
||||
SSA(3)=ANTI
|
||||
|
||||
!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
|
||||
! lopsided_kodis shares the symmetry_bd buffer between advection and
|
||||
! dissipation, eliminating redundant full-grid copies. For metric variables
|
||||
! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
|
||||
! so the constant offset has no effect on dissipation.
|
||||
!!!!!!!!!advection term part
|
||||
|
||||
call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
|
||||
call lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS)
|
||||
call lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS)
|
||||
call lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA)
|
||||
call lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS)
|
||||
call lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA)
|
||||
call lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS)
|
||||
|
||||
call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
|
||||
call lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS)
|
||||
call lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS)
|
||||
call lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA)
|
||||
call lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS)
|
||||
call lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA)
|
||||
call lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS)
|
||||
|
||||
call lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps)
|
||||
call lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS)
|
||||
call lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS)
|
||||
|
||||
call lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
|
||||
|
||||
#if 1
|
||||
!! bam does not apply dissipation on gauge variables
|
||||
call lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps)
|
||||
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
|
||||
call lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
|
||||
#endif
|
||||
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
|
||||
call lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
|
||||
call lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
|
||||
#endif
|
||||
#else
|
||||
! No dissipation on gauge variables (advection only)
|
||||
call lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS)
|
||||
call lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS)
|
||||
call lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA)
|
||||
!!
|
||||
call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS)
|
||||
|
||||
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
|
||||
call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS)
|
||||
call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS)
|
||||
call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA)
|
||||
#endif
|
||||
|
||||
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
|
||||
call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS)
|
||||
call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS)
|
||||
call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA)
|
||||
#endif
|
||||
|
||||
if(eps>0)then
|
||||
! usual Kreiss-Oliger dissipation
|
||||
call kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps)
|
||||
#if 0
|
||||
#define i 42
|
||||
#define j 40
|
||||
#define k 40
|
||||
if(Lev == 1)then
|
||||
write(*,*) X(i),Y(j),Z(k)
|
||||
write(*,*) "before",Axx_rhs(i,j,k)
|
||||
endif
|
||||
#undef i
|
||||
#undef j
|
||||
#undef k
|
||||
!!stop
|
||||
#endif
|
||||
call kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps)
|
||||
#if 0
|
||||
#define i 42
|
||||
#define j 40
|
||||
#define k 40
|
||||
if(Lev == 1)then
|
||||
write(*,*) X(i),Y(j),Z(k)
|
||||
write(*,*) "after",Axx_rhs(i,j,k)
|
||||
endif
|
||||
#undef i
|
||||
#undef j
|
||||
#undef k
|
||||
!!stop
|
||||
#endif
|
||||
call kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps)
|
||||
|
||||
#if 1
|
||||
!! bam does not apply dissipation on gauge variables
|
||||
call kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps)
|
||||
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
|
||||
call kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps)
|
||||
call kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
endif
|
||||
|
||||
if(co == 0)then
|
||||
! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -130,11 +130,7 @@ void cgh::compose_cgh(int nprocs)
|
||||
for (int lev = 0; lev < levels; lev++)
|
||||
{
|
||||
checkPatchList(PatL[lev], false);
|
||||
#ifdef INTERP_LB_OPTIMIZE
|
||||
Parallel::distribute_optimize(PatL[lev], nprocs, ingfs, fngfs, false);
|
||||
#else
|
||||
Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
|
||||
#endif
|
||||
#if (RPB == 1)
|
||||
// we need distributed box of PatL[lev] and PatL[lev-1]
|
||||
if (lev > 0)
|
||||
@@ -1305,13 +1301,13 @@ bool cgh::Interp_One_Point(MyList<var> *VarList,
|
||||
}
|
||||
|
||||
|
||||
bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
|
||||
void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
|
||||
MyList<var> *OldList, MyList<var> *StateList,
|
||||
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
|
||||
monitor *ErrorMonitor)
|
||||
{
|
||||
if (lev < movls)
|
||||
return false;
|
||||
return;
|
||||
|
||||
#if (0)
|
||||
// #if (PSTR == 1 || PSTR == 2)
|
||||
@@ -1400,7 +1396,7 @@ bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
|
||||
for (bhi = 0; bhi < BH_num; bhi++)
|
||||
delete[] tmpPorg[bhi];
|
||||
delete[] tmpPorg;
|
||||
return false;
|
||||
return;
|
||||
}
|
||||
// x direction
|
||||
rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
|
||||
@@ -1504,7 +1500,6 @@ bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
|
||||
for (int bhi = 0; bhi < BH_num; bhi++)
|
||||
delete[] tmpPorg[bhi];
|
||||
delete[] tmpPorg;
|
||||
return tot_flag;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -74,7 +74,7 @@ public:
|
||||
MyList<var> *OldList, MyList<var> *StateList,
|
||||
MyList<var> *FutureList, MyList<var> *tmList,
|
||||
int Symmetry, bool BB);
|
||||
bool Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
|
||||
void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
|
||||
MyList<var> *OldList, MyList<var> *StateList,
|
||||
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
|
||||
monitor *ErrorMonitor);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -18,61 +18,49 @@
|
||||
real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz
|
||||
|
||||
!~~~~~~~> Local variable:
|
||||
|
||||
integer :: i,j,k
|
||||
real*8 :: lgxx,lgyy,lgzz,ldetg
|
||||
real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
|
||||
real*8 :: ltrA,lscale
|
||||
|
||||
real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg
|
||||
real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz
|
||||
real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
|
||||
real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
|
||||
|
||||
!~~~~~~>
|
||||
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
gxx = dxx + ONE
|
||||
gyy = dyy + ONE
|
||||
gzz = dzz + ONE
|
||||
|
||||
lgxx = dxx(i,j,k) + ONE
|
||||
lgyy = dyy(i,j,k) + ONE
|
||||
lgzz = dzz(i,j,k) + ONE
|
||||
detg = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
|
||||
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
|
||||
gupxx = ( gyy * gzz - gyz * gyz ) / detg
|
||||
gupxy = - ( gxy * gzz - gyz * gxz ) / detg
|
||||
gupxz = ( gxy * gyz - gyy * gxz ) / detg
|
||||
gupyy = ( gxx * gzz - gxz * gxz ) / detg
|
||||
gupyz = - ( gxx * gyz - gxy * gxz ) / detg
|
||||
gupzz = ( gxx * gyy - gxy * gxy ) / detg
|
||||
|
||||
ldetg = lgxx * lgyy * lgzz &
|
||||
+ gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) &
|
||||
+ gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
|
||||
- gxz(i,j,k) * lgyy * gxz(i,j,k) &
|
||||
- gxy(i,j,k) * gxy(i,j,k) * lgzz &
|
||||
- lgxx * gyz(i,j,k) * gyz(i,j,k)
|
||||
trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz &
|
||||
+ TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
|
||||
|
||||
lgupxx = ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg
|
||||
lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg
|
||||
lgupxz = ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg
|
||||
lgupyy = ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg
|
||||
lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg
|
||||
lgupzz = ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg
|
||||
Axx = Axx - F1o3 * gxx * trA
|
||||
Axy = Axy - F1o3 * gxy * trA
|
||||
Axz = Axz - F1o3 * gxz * trA
|
||||
Ayy = Ayy - F1o3 * gyy * trA
|
||||
Ayz = Ayz - F1o3 * gyz * trA
|
||||
Azz = Azz - F1o3 * gzz * trA
|
||||
|
||||
ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
|
||||
+ lgupzz * Azz(i,j,k) &
|
||||
+ TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
|
||||
+ lgupyz * Ayz(i,j,k))
|
||||
detg = ONE / ( detg ** F1o3 )
|
||||
|
||||
gxx = gxx * detg
|
||||
gxy = gxy * detg
|
||||
gxz = gxz * detg
|
||||
gyy = gyy * detg
|
||||
gyz = gyz * detg
|
||||
gzz = gzz * detg
|
||||
|
||||
Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
|
||||
Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA
|
||||
Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA
|
||||
Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
|
||||
Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA
|
||||
Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
|
||||
|
||||
lscale = ONE / ( ldetg ** F1o3 )
|
||||
|
||||
dxx(i,j,k) = lgxx * lscale - ONE
|
||||
gxy(i,j,k) = gxy(i,j,k) * lscale
|
||||
gxz(i,j,k) = gxz(i,j,k) * lscale
|
||||
dyy(i,j,k) = lgyy * lscale - ONE
|
||||
gyz(i,j,k) = gyz(i,j,k) * lscale
|
||||
dzz(i,j,k) = lgzz * lscale - ONE
|
||||
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
dxx = gxx - ONE
|
||||
dyy = gyy - ONE
|
||||
dzz = gzz - ONE
|
||||
|
||||
return
|
||||
|
||||
@@ -94,71 +82,51 @@
|
||||
real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz
|
||||
|
||||
!~~~~~~~> Local variable:
|
||||
|
||||
integer :: i,j,k
|
||||
real*8 :: lgxx,lgyy,lgzz,lscale
|
||||
real*8 :: lgxy,lgxz,lgyz
|
||||
real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
|
||||
real*8 :: ltrA
|
||||
|
||||
real*8, dimension(ex(1),ex(2),ex(3)) :: trA
|
||||
real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz
|
||||
real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
|
||||
real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
|
||||
|
||||
!~~~~~~>
|
||||
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
gxx = dxx + ONE
|
||||
gyy = dyy + ONE
|
||||
gzz = dzz + ONE
|
||||
! for g
|
||||
gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
|
||||
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
|
||||
|
||||
! for g: normalize determinant first
|
||||
lgxx = dxx(i,j,k) + ONE
|
||||
lgyy = dyy(i,j,k) + ONE
|
||||
lgzz = dzz(i,j,k) + ONE
|
||||
lgxy = gxy(i,j,k)
|
||||
lgxz = gxz(i,j,k)
|
||||
lgyz = gyz(i,j,k)
|
||||
gupzz = ONE / ( gupzz ** F1o3 )
|
||||
|
||||
gxx = gxx * gupzz
|
||||
gxy = gxy * gupzz
|
||||
gxz = gxz * gupzz
|
||||
gyy = gyy * gupzz
|
||||
gyz = gyz * gupzz
|
||||
gzz = gzz * gupzz
|
||||
|
||||
lscale = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz &
|
||||
+ lgxz * lgxy * lgyz - lgxz * lgyy * lgxz &
|
||||
- lgxy * lgxy * lgzz - lgxx * lgyz * lgyz
|
||||
dxx = gxx - ONE
|
||||
dyy = gyy - ONE
|
||||
dzz = gzz - ONE
|
||||
! for A
|
||||
|
||||
lscale = ONE / ( lscale ** F1o3 )
|
||||
gupxx = ( gyy * gzz - gyz * gyz )
|
||||
gupxy = - ( gxy * gzz - gyz * gxz )
|
||||
gupxz = ( gxy * gyz - gyy * gxz )
|
||||
gupyy = ( gxx * gzz - gxz * gxz )
|
||||
gupyz = - ( gxx * gyz - gxy * gxz )
|
||||
gupzz = ( gxx * gyy - gxy * gxy )
|
||||
|
||||
lgxx = lgxx * lscale
|
||||
lgxy = lgxy * lscale
|
||||
lgxz = lgxz * lscale
|
||||
lgyy = lgyy * lscale
|
||||
lgyz = lgyz * lscale
|
||||
lgzz = lgzz * lscale
|
||||
trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz &
|
||||
+ TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
|
||||
|
||||
dxx(i,j,k) = lgxx - ONE
|
||||
gxy(i,j,k) = lgxy
|
||||
gxz(i,j,k) = lgxz
|
||||
dyy(i,j,k) = lgyy - ONE
|
||||
gyz(i,j,k) = lgyz
|
||||
dzz(i,j,k) = lgzz - ONE
|
||||
|
||||
! for A: trace-free using normalized metric (det=1, no division needed)
|
||||
lgupxx = ( lgyy * lgzz - lgyz * lgyz )
|
||||
lgupxy = - ( lgxy * lgzz - lgyz * lgxz )
|
||||
lgupxz = ( lgxy * lgyz - lgyy * lgxz )
|
||||
lgupyy = ( lgxx * lgzz - lgxz * lgxz )
|
||||
lgupyz = - ( lgxx * lgyz - lgxy * lgxz )
|
||||
lgupzz = ( lgxx * lgyy - lgxy * lgxy )
|
||||
|
||||
ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
|
||||
+ lgupzz * Azz(i,j,k) &
|
||||
+ TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
|
||||
+ lgupyz * Ayz(i,j,k))
|
||||
|
||||
Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
|
||||
Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
|
||||
Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
|
||||
Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
|
||||
Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
|
||||
Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
|
||||
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
Axx = Axx - F1o3 * gxx * trA
|
||||
Axy = Axy - F1o3 * gxy * trA
|
||||
Axz = Axz - F1o3 * gxz * trA
|
||||
Ayy = Ayy - F1o3 * gyy * trA
|
||||
Ayz = Ayz - F1o3 * gyz * trA
|
||||
Azz = Azz - F1o3 * gzz * trA
|
||||
|
||||
return
|
||||
|
||||
|
||||
@@ -1,268 +0,0 @@
|
||||
#include "tool.h"
|
||||
void fdderivs(const int ex[3],
|
||||
const double *f,
|
||||
double *fxx, double *fxy, double *fxz,
|
||||
double *fyy, double *fyz, double *fzz,
|
||||
const double *X, const double *Y, const double *Z,
|
||||
double SYM1, double SYM2, double SYM3,
|
||||
int Symmetry, int onoff)
|
||||
{
|
||||
(void)onoff;
|
||||
|
||||
const int NO_SYMM = 0, EQ_SYMM = 1;
|
||||
const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
|
||||
const double F1o4 = 2.5e-1; // 1/4
|
||||
const double F8 = 8.0;
|
||||
const double F16 = 16.0;
|
||||
const double F30 = 30.0;
|
||||
const double F1o12 = ONE / 12.0;
|
||||
const double F1o144 = ONE / 144.0;
|
||||
|
||||
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
||||
|
||||
const double dX = X[1] - X[0];
|
||||
const double dY = Y[1] - Y[0];
|
||||
const double dZ = Z[1] - Z[0];
|
||||
|
||||
const int imaxF = ex1;
|
||||
const int jmaxF = ex2;
|
||||
const int kmaxF = ex3;
|
||||
|
||||
int iminF = 1, jminF = 1, kminF = 1;
|
||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
|
||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
|
||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
|
||||
|
||||
const double SoA[3] = { SYM1, SYM2, SYM3 };
|
||||
|
||||
/* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
|
||||
const size_t nx = (size_t)ex1 + 2;
|
||||
const size_t ny = (size_t)ex2 + 2;
|
||||
const size_t nz = (size_t)ex3 + 2;
|
||||
const size_t fh_size = nx * ny * nz;
|
||||
|
||||
static double *fh = NULL;
|
||||
static size_t cap = 0;
|
||||
|
||||
if (fh_size > cap) {
|
||||
free(fh);
|
||||
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
|
||||
cap = fh_size;
|
||||
}
|
||||
// double *fh = (double*)malloc(fh_size * sizeof(double));
|
||||
if (!fh) return;
|
||||
|
||||
symmetry_bd(2, ex, f, fh, SoA);
|
||||
|
||||
/* 系数:按 Fortran 原式 */
|
||||
const double Sdxdx = ONE / (dX * dX);
|
||||
const double Sdydy = ONE / (dY * dY);
|
||||
const double Sdzdz = ONE / (dZ * dZ);
|
||||
|
||||
const double Fdxdx = F1o12 / (dX * dX);
|
||||
const double Fdydy = F1o12 / (dY * dY);
|
||||
const double Fdzdz = F1o12 / (dZ * dZ);
|
||||
|
||||
const double Sdxdy = F1o4 / (dX * dY);
|
||||
const double Sdxdz = F1o4 / (dX * dZ);
|
||||
const double Sdydz = F1o4 / (dY * dZ);
|
||||
|
||||
const double Fdxdy = F1o144 / (dX * dY);
|
||||
const double Fdxdz = F1o144 / (dX * dZ);
|
||||
const double Fdydz = F1o144 / (dY * dZ);
|
||||
|
||||
/* 输出清零:fxx,fyy,fzz,fxy,fxz,fyz = 0 */
|
||||
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
|
||||
for (size_t p = 0; p < all; ++p) {
|
||||
fxx[p] = ZEO; fyy[p] = ZEO; fzz[p] = ZEO;
|
||||
fxy[p] = ZEO; fxz[p] = ZEO; fyz[p] = ZEO;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fortran:
|
||||
* do k=1,ex3-1
|
||||
* do j=1,ex2-1
|
||||
* do i=1,ex1-1
|
||||
*/
|
||||
|
||||
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
||||
const int kF = k0 + 1;
|
||||
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
||||
const int jF = j0 + 1;
|
||||
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
||||
const int iF = i0 + 1;
|
||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||
|
||||
/* 高阶分支:i±2,j±2,k±2 都在范围内 */
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
{
|
||||
fxx[p] = Fdxdx * (
|
||||
-fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] +
|
||||
F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
|
||||
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
||||
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] +
|
||||
F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
||||
);
|
||||
|
||||
fyy[p] = Fdydy * (
|
||||
-fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] +
|
||||
F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
|
||||
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
||||
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] +
|
||||
F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
||||
);
|
||||
|
||||
fzz[p] = Fdzdz * (
|
||||
-fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] +
|
||||
F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
|
||||
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
||||
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] +
|
||||
F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
||||
);
|
||||
|
||||
/* fxy 高阶:完全照搬 Fortran 的括号结构 */
|
||||
{
|
||||
const double t_jm2 =
|
||||
( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
|
||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
|
||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
|
||||
- fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
|
||||
|
||||
const double t_jm1 =
|
||||
( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
|
||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
|
||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
|
||||
- fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
|
||||
|
||||
const double t_jp1 =
|
||||
( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
|
||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
|
||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
|
||||
- fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
|
||||
|
||||
const double t_jp2 =
|
||||
( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
|
||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
|
||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
|
||||
- fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
|
||||
|
||||
fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
|
||||
}
|
||||
|
||||
/* fxz 高阶 */
|
||||
{
|
||||
const double t_km2 =
|
||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
|
||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
|
||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
|
||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
|
||||
|
||||
const double t_km1 =
|
||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
|
||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
|
||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
|
||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
|
||||
|
||||
const double t_kp1 =
|
||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
|
||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
|
||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
|
||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
|
||||
|
||||
const double t_kp2 =
|
||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
|
||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
|
||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
|
||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
|
||||
|
||||
fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
|
||||
}
|
||||
|
||||
/* fyz 高阶 */
|
||||
{
|
||||
const double t_km2 =
|
||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
|
||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
|
||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
|
||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
|
||||
|
||||
const double t_km1 =
|
||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
|
||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
|
||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
|
||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
|
||||
|
||||
const double t_kp1 =
|
||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
|
||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
|
||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
|
||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
|
||||
|
||||
const double t_kp2 =
|
||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
|
||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
|
||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
|
||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
|
||||
|
||||
fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
|
||||
}
|
||||
}
|
||||
/* 二阶分支:i±1,j±1,k±1 在范围内 */
|
||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
||||
{
|
||||
fxx[p] = Sdxdx * (
|
||||
fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
|
||||
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
|
||||
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
||||
);
|
||||
|
||||
fyy[p] = Sdydy * (
|
||||
fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
|
||||
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
|
||||
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
||||
);
|
||||
|
||||
fzz[p] = Sdzdz * (
|
||||
fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
|
||||
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
|
||||
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
||||
);
|
||||
|
||||
fxy[p] = Sdxdy * (
|
||||
fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
|
||||
fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
|
||||
fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
|
||||
fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
|
||||
);
|
||||
|
||||
fxz[p] = Sdxdz * (
|
||||
fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
|
||||
fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
|
||||
fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
|
||||
fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
|
||||
);
|
||||
|
||||
fyz[p] = Sdydz * (
|
||||
fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
|
||||
fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
|
||||
fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
|
||||
fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
|
||||
);
|
||||
}else{
|
||||
fxx[p] = 0.0;
|
||||
fyy[p] = 0.0;
|
||||
fzz[p] = 0.0;
|
||||
fxy[p] = 0.0;
|
||||
fxz[p] = 0.0;
|
||||
fyz[p] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// free(fh);
|
||||
}
|
||||
@@ -1,150 +0,0 @@
|
||||
#include "tool.h"
|
||||
|
||||
/*
|
||||
* C 版 fderivs
|
||||
*
|
||||
* Fortran:
|
||||
* subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
|
||||
*
|
||||
* 约定:
|
||||
* f, fx, fy, fz: ex1*ex2*ex3,按 idx_ex 布局
|
||||
* X: ex1, Y: ex2, Z: ex3
|
||||
*/
|
||||
void fderivs(const int ex[3],
|
||||
const double *f,
|
||||
double *fx, double *fy, double *fz,
|
||||
const double *X, const double *Y, const double *Z,
|
||||
double SYM1, double SYM2, double SYM3,
|
||||
int Symmetry, int onoff)
|
||||
{
|
||||
(void)onoff; // Fortran 里没用到
|
||||
|
||||
const double ZEO = 0.0, ONE = 1.0;
|
||||
const double TWO = 2.0, EIT = 8.0;
|
||||
const double F12 = 12.0;
|
||||
|
||||
const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 在本子程序里不直接用
|
||||
|
||||
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
||||
|
||||
// dX = X(2)-X(1) -> C: X[1]-X[0]
|
||||
const double dX = X[1] - X[0];
|
||||
const double dY = Y[1] - Y[0];
|
||||
const double dZ = Z[1] - Z[0];
|
||||
|
||||
// Fortran 1-based bounds
|
||||
const int imaxF = ex1;
|
||||
const int jmaxF = ex2;
|
||||
const int kmaxF = ex3;
|
||||
|
||||
int iminF = 1, jminF = 1, kminF = 1;
|
||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
|
||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
|
||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
|
||||
|
||||
// SoA(1:3) = SYM1,SYM2,SYM3
|
||||
const double SoA[3] = { SYM1, SYM2, SYM3 };
|
||||
|
||||
// fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2
|
||||
const size_t nx = (size_t)ex1 + 2;
|
||||
const size_t ny = (size_t)ex2 + 2;
|
||||
const size_t nz = (size_t)ex3 + 2;
|
||||
const size_t fh_size = nx * ny * nz;
|
||||
static double *fh = NULL;
|
||||
static size_t cap = 0;
|
||||
|
||||
if (fh_size > cap) {
|
||||
free(fh);
|
||||
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
|
||||
cap = fh_size;
|
||||
}
|
||||
// double *fh = (double*)malloc(fh_size * sizeof(double));
|
||||
if (!fh) return;
|
||||
|
||||
// call symmetry_bd(2,ex,f,fh,SoA)
|
||||
symmetry_bd(2, ex, f, fh, SoA);
|
||||
|
||||
const double d12dx = ONE / F12 / dX;
|
||||
const double d12dy = ONE / F12 / dY;
|
||||
const double d12dz = ONE / F12 / dZ;
|
||||
|
||||
const double d2dx = ONE / TWO / dX;
|
||||
const double d2dy = ONE / TWO / dY;
|
||||
const double d2dz = ONE / TWO / dZ;
|
||||
|
||||
// fx = fy = fz = 0
|
||||
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
|
||||
for (size_t p = 0; p < all; ++p) {
|
||||
fx[p] = ZEO;
|
||||
fy[p] = ZEO;
|
||||
fz[p] = ZEO;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fortran loops:
|
||||
* do k=1,ex3-1
|
||||
* do j=1,ex2-1
|
||||
* do i=1,ex1-1
|
||||
*
|
||||
* C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
|
||||
*/
|
||||
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
||||
const int kF = k0 + 1;
|
||||
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
||||
const int jF = j0 + 1;
|
||||
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
||||
const int iF = i0 + 1;
|
||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||
|
||||
// if(i+2 <= imax .and. i-2 >= imin ... ) (全是 Fortran 索引)
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
{
|
||||
fx[p] = d12dx * (
|
||||
fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] -
|
||||
EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
||||
EIT * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] -
|
||||
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)]
|
||||
);
|
||||
|
||||
fy[p] = d12dy * (
|
||||
fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] -
|
||||
EIT * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
|
||||
EIT * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] -
|
||||
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)]
|
||||
);
|
||||
|
||||
fz[p] = d12dz * (
|
||||
fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] -
|
||||
EIT * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
|
||||
EIT * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] -
|
||||
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]
|
||||
);
|
||||
}
|
||||
// elseif(i+1 <= imax .and. i-1 >= imin ...)
|
||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
||||
{
|
||||
fx[p] = d2dx * (
|
||||
-fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
||||
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
||||
);
|
||||
|
||||
fy[p] = d2dy * (
|
||||
-fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
|
||||
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
||||
);
|
||||
|
||||
fz[p] = d2dz * (
|
||||
-fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
|
||||
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// free(fh);
|
||||
}
|
||||
@@ -324,6 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
|
||||
|
||||
integer::i
|
||||
|
||||
funcc = 0.d0
|
||||
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
|
||||
do i=0,ord-1
|
||||
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
|
||||
@@ -349,6 +350,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
|
||||
|
||||
integer::i
|
||||
|
||||
funcc = 0.d0
|
||||
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
|
||||
do i=0,ord-1
|
||||
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
|
||||
@@ -377,6 +379,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
|
||||
|
||||
integer::i
|
||||
|
||||
funcc = 0.d0
|
||||
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
|
||||
do i=0,ord-1
|
||||
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
|
||||
@@ -883,17 +886,14 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
|
||||
|
||||
integer::i
|
||||
|
||||
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
|
||||
funcc = 0.d0
|
||||
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
|
||||
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
|
||||
do i=0,ord-1
|
||||
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
|
||||
enddo
|
||||
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
|
||||
do i=0,ord-1
|
||||
funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
|
||||
enddo
|
||||
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
|
||||
do i=0,ord-1
|
||||
funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
|
||||
enddo
|
||||
@@ -912,6 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
|
||||
|
||||
integer::i
|
||||
|
||||
funcc = 0.d0
|
||||
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
|
||||
do i=0,ord-1
|
||||
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
|
||||
@@ -940,6 +941,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
|
||||
|
||||
integer::i
|
||||
|
||||
funcc = 0.d0
|
||||
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
|
||||
do i=0,ord-1
|
||||
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
|
||||
@@ -1115,9 +1117,7 @@ end subroutine d2dump
|
||||
!------------------------------------------------------------------------------
|
||||
! Lagrangian polynomial interpolation
|
||||
!------------------------------------------------------------------------------
|
||||
|
||||
!DIR$ ATTRIBUTES FORCEINLINE :: polint
|
||||
subroutine polint(xa, ya, x, y, dy, ordn)
|
||||
subroutine polint(xa, ya, x, y, dy, ordn)
|
||||
implicit none
|
||||
|
||||
integer, intent(in) :: ordn
|
||||
@@ -1129,13 +1129,15 @@ end subroutine d2dump
|
||||
real*8, dimension(ordn) :: c, d, ho
|
||||
real*8 :: dif, dift, hp, h, den_val
|
||||
|
||||
! Initialization
|
||||
c = ya
|
||||
d = ya
|
||||
ho = xa - x
|
||||
|
||||
|
||||
ns = 1
|
||||
dif = abs(x - xa(1))
|
||||
|
||||
|
||||
! Find the index of the closest table entry
|
||||
do i = 2, ordn
|
||||
dift = abs(x - xa(i))
|
||||
if (dift < dif) then
|
||||
@@ -1146,26 +1148,31 @@ end subroutine d2dump
|
||||
|
||||
y = ya(ns)
|
||||
ns = ns - 1
|
||||
|
||||
|
||||
! Main Neville's algorithm loop
|
||||
do m = 1, ordn - 1
|
||||
n_m = ordn - m
|
||||
do i = 1, n_m
|
||||
hp = ho(i)
|
||||
h = ho(i+m)
|
||||
den_val = hp - h
|
||||
|
||||
|
||||
! Check for division by zero locally
|
||||
if (den_val == 0.0d0) then
|
||||
write(*,*) 'failure in polint for point',x
|
||||
write(*,*) 'with input points: ',xa
|
||||
stop
|
||||
end if
|
||||
|
||||
|
||||
! Reuse den_val to avoid redundant divisions
|
||||
den_val = (c(i+1) - d(i)) / den_val
|
||||
|
||||
|
||||
! Update c and d in place
|
||||
d(i) = h * den_val
|
||||
c(i) = hp * den_val
|
||||
end do
|
||||
|
||||
! Decide which path (up or down the tableau) to take
|
||||
if (2 * ns < n_m) then
|
||||
dy = c(ns + 1)
|
||||
else
|
||||
@@ -1182,92 +1189,68 @@ end subroutine d2dump
|
||||
! interpolation in 2 dimensions, follow yx order
|
||||
!
|
||||
!------------------------------------------------------------------------------
|
||||
subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
|
||||
implicit none
|
||||
subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
|
||||
implicit none
|
||||
integer,intent(in) :: ordn
|
||||
real*8, dimension(ordn), intent(in) :: x1a,x2a
|
||||
real*8, dimension(ordn,ordn), intent(in) :: ya
|
||||
real*8, intent(in) :: x1,x2
|
||||
real*8, intent(out) :: y,dy
|
||||
|
||||
integer,intent(in) :: ordn
|
||||
real*8, dimension(1:ordn), intent(in) :: x1a,x2a
|
||||
real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
|
||||
real*8, intent(in) :: x1,x2
|
||||
real*8, intent(out) :: y,dy
|
||||
integer :: j
|
||||
real*8, dimension(ordn) :: ymtmp
|
||||
real*8 :: dy_temp ! Local variable to prevent overwriting result
|
||||
|
||||
#ifdef POLINT_LEGACY_ORDER
|
||||
integer :: i,m
|
||||
real*8, dimension(ordn) :: ymtmp
|
||||
real*8, dimension(ordn) :: yntmp
|
||||
! Optimized sequence: Loop over columns (j)
|
||||
! ya(:,j) is a contiguous memory block in Fortran
|
||||
do j=1,ordn
|
||||
call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
|
||||
end do
|
||||
|
||||
m=size(x1a)
|
||||
do i=1,m
|
||||
yntmp=ya(i,:)
|
||||
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
|
||||
end do
|
||||
call polint(x1a,ymtmp,x1,y,dy,ordn)
|
||||
#else
|
||||
integer :: j
|
||||
real*8, dimension(ordn) :: ymtmp
|
||||
real*8 :: dy_temp
|
||||
! Final interpolation on the results
|
||||
call polint(x2a, ymtmp, x2, y, dy, ordn)
|
||||
|
||||
do j=1,ordn
|
||||
call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
|
||||
end do
|
||||
call polint(x2a, ymtmp, x2, y, dy, ordn)
|
||||
#endif
|
||||
|
||||
return
|
||||
return
|
||||
end subroutine polin2
|
||||
!------------------------------------------------------------------------------
|
||||
!
|
||||
! interpolation in 3 dimensions, follow zyx order
|
||||
!
|
||||
!------------------------------------------------------------------------------
|
||||
subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
|
||||
implicit none
|
||||
subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
|
||||
implicit none
|
||||
integer,intent(in) :: ordn
|
||||
real*8, dimension(ordn), intent(in) :: x1a,x2a,x3a
|
||||
real*8, dimension(ordn,ordn,ordn), intent(in) :: ya
|
||||
real*8, intent(in) :: x1,x2,x3
|
||||
real*8, intent(out) :: y,dy
|
||||
|
||||
integer,intent(in) :: ordn
|
||||
real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
|
||||
real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
|
||||
real*8, intent(in) :: x1,x2,x3
|
||||
real*8, intent(out) :: y,dy
|
||||
integer :: j, k
|
||||
real*8, dimension(ordn,ordn) :: yatmp
|
||||
real*8, dimension(ordn) :: ymtmp
|
||||
real*8 :: dy_temp
|
||||
|
||||
#ifdef POLINT_LEGACY_ORDER
|
||||
integer :: i,j,m,n
|
||||
real*8, dimension(ordn,ordn) :: yatmp
|
||||
real*8, dimension(ordn) :: ymtmp
|
||||
real*8, dimension(ordn) :: yntmp
|
||||
real*8, dimension(ordn) :: yqtmp
|
||||
|
||||
m=size(x1a)
|
||||
n=size(x2a)
|
||||
do i=1,m
|
||||
do j=1,n
|
||||
yqtmp=ya(i,j,:)
|
||||
call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
|
||||
end do
|
||||
yntmp=yatmp(i,:)
|
||||
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
|
||||
end do
|
||||
call polint(x1a,ymtmp,x1,y,dy,ordn)
|
||||
#else
|
||||
integer :: j, k
|
||||
real*8, dimension(ordn,ordn) :: yatmp
|
||||
real*8, dimension(ordn) :: ymtmp
|
||||
real*8 :: dy_temp
|
||||
|
||||
do k=1,ordn
|
||||
do j=1,ordn
|
||||
call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
|
||||
! Sequence change: Process the contiguous first dimension (x1) first.
|
||||
! We loop through the 'slow' planes (j, k) to extract 'fast' columns.
|
||||
do k=1,ordn
|
||||
do j=1,ordn
|
||||
! ya(:,j,k) is contiguous; much faster than ya(i,j,:)
|
||||
call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
|
||||
end do
|
||||
end do
|
||||
end do
|
||||
do k=1,ordn
|
||||
call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
|
||||
end do
|
||||
call polint(x3a, ymtmp, x3, y, dy, ordn)
|
||||
#endif
|
||||
|
||||
return
|
||||
! Now process the second dimension
|
||||
do k=1,ordn
|
||||
call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
|
||||
end do
|
||||
|
||||
! Final dimension
|
||||
call polint(x3a, ymtmp, x3, y, dy, ordn)
|
||||
|
||||
return
|
||||
end subroutine polin3
|
||||
!--------------------------------------------------------------------------------------
|
||||
! calculate L2norm
|
||||
! calculate L2norm
|
||||
subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
|
||||
f,f_out,gw)
|
||||
|
||||
@@ -1284,9 +1267,7 @@ end subroutine d2dump
|
||||
real*8 :: dX, dY, dZ
|
||||
integer::imin,jmin,kmin
|
||||
integer::imax,jmax,kmax
|
||||
integer::i,j,k,n_elements
|
||||
real*8, dimension(:), allocatable :: f_flat
|
||||
real*8, external :: DDOT
|
||||
integer::i,j,k
|
||||
|
||||
dX = X(2) - X(1)
|
||||
dY = Y(2) - Y(1)
|
||||
@@ -1310,12 +1291,7 @@ if(dabs(X(1)-xmin) < dX) imin = 1
|
||||
if(dabs(Y(1)-ymin) < dY) jmin = 1
|
||||
if(dabs(Z(1)-zmin) < dZ) kmin = 1
|
||||
|
||||
! Optimized with oneMKL BLAS DDOT for dot product
|
||||
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
||||
allocate(f_flat(n_elements))
|
||||
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
|
||||
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
|
||||
deallocate(f_flat)
|
||||
f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
|
||||
|
||||
f_out = f_out*dX*dY*dZ
|
||||
|
||||
@@ -1340,9 +1316,7 @@ f_out = f_out*dX*dY*dZ
|
||||
real*8 :: dX, dY, dZ
|
||||
integer::imin,jmin,kmin
|
||||
integer::imax,jmax,kmax
|
||||
integer::i,j,k,n_elements
|
||||
real*8, dimension(:), allocatable :: f_flat
|
||||
real*8, external :: DDOT
|
||||
integer::i,j,k
|
||||
|
||||
real*8 :: PIo4
|
||||
|
||||
@@ -1405,12 +1379,7 @@ if(Symmetry==2)then
|
||||
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
|
||||
endif
|
||||
|
||||
! Optimized with oneMKL BLAS DDOT for dot product
|
||||
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
||||
allocate(f_flat(n_elements))
|
||||
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
|
||||
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
|
||||
deallocate(f_flat)
|
||||
f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
|
||||
|
||||
f_out = f_out*dX*dY*dZ
|
||||
|
||||
@@ -1438,8 +1407,6 @@ f_out = f_out*dX*dY*dZ
|
||||
integer::imin,jmin,kmin
|
||||
integer::imax,jmax,kmax
|
||||
integer::i,j,k
|
||||
real*8, dimension(:), allocatable :: f_flat
|
||||
real*8, external :: DDOT
|
||||
|
||||
real*8 :: PIo4
|
||||
|
||||
@@ -1502,12 +1469,11 @@ if(Symmetry==2)then
|
||||
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
|
||||
endif
|
||||
|
||||
! Optimized with oneMKL BLAS DDOT for dot product
|
||||
f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
|
||||
|
||||
f_out = f_out
|
||||
|
||||
Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
|
||||
allocate(f_flat(Nout))
|
||||
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
|
||||
f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
|
||||
deallocate(f_flat)
|
||||
|
||||
return
|
||||
|
||||
@@ -1705,7 +1671,6 @@ deallocate(f_flat)
|
||||
real*8, dimension(ORDN,ORDN) :: tmp2
|
||||
real*8, dimension(ORDN) :: tmp1
|
||||
real*8, dimension(3) :: SoAh
|
||||
real*8, external :: DDOT
|
||||
|
||||
! +1 because c++ gives 0 for first point
|
||||
cxB = inds+1
|
||||
@@ -1741,21 +1706,20 @@ deallocate(f_flat)
|
||||
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
|
||||
endif
|
||||
|
||||
! Optimized with BLAS operations for better performance
|
||||
! First dimension: z-direction weighted sum
|
||||
tmp2=0
|
||||
do m=1,ORDN
|
||||
tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
|
||||
enddo
|
||||
|
||||
! Second dimension: y-direction weighted sum
|
||||
tmp1=0
|
||||
do m=1,ORDN
|
||||
tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
|
||||
enddo
|
||||
|
||||
! Third dimension: x-direction weighted sum using BLAS DDOT
|
||||
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
|
||||
f_int=0
|
||||
do m=1,ORDN
|
||||
f_int = f_int + coef(m)*tmp1(m)
|
||||
enddo
|
||||
|
||||
return
|
||||
|
||||
@@ -1785,7 +1749,6 @@ deallocate(f_flat)
|
||||
real*8, dimension(ORDN,ORDN) :: ya
|
||||
real*8, dimension(ORDN) :: tmp1
|
||||
real*8, dimension(2) :: SoAh
|
||||
real*8, external :: DDOT
|
||||
|
||||
! +1 because c++ gives 0 for first point
|
||||
cxB = inds(1:2)+1
|
||||
@@ -1815,14 +1778,15 @@ deallocate(f_flat)
|
||||
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
|
||||
endif
|
||||
|
||||
! Optimized with BLAS operations
|
||||
tmp1=0
|
||||
do m=1,ORDN
|
||||
tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
|
||||
enddo
|
||||
|
||||
! Use BLAS DDOT for final weighted sum
|
||||
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
|
||||
f_int=0
|
||||
do m=1,ORDN
|
||||
f_int = f_int + coef(m)*tmp1(m)
|
||||
enddo
|
||||
|
||||
return
|
||||
|
||||
@@ -1853,7 +1817,6 @@ deallocate(f_flat)
|
||||
real*8, dimension(ORDN) :: ya
|
||||
real*8 :: SoAh
|
||||
integer,dimension(3) :: inds
|
||||
real*8, external :: DDOT
|
||||
|
||||
! +1 because c++ gives 0 for first point
|
||||
inds = indsi + 1
|
||||
@@ -1914,8 +1877,10 @@ deallocate(f_flat)
|
||||
write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
|
||||
endif
|
||||
|
||||
! Optimized with BLAS DDOT for weighted sum
|
||||
f_int = DDOT(ORDN, coef, 1, ya, 1)
|
||||
f_int=0
|
||||
do m=1,ORDN
|
||||
f_int = f_int + coef(m)*ya(m)
|
||||
enddo
|
||||
|
||||
return
|
||||
|
||||
@@ -2147,38 +2112,24 @@ deallocate(f_flat)
|
||||
|
||||
end function fWigner_d_function
|
||||
!----------------------------------
|
||||
! Optimized factorial function using lookup table for small N
|
||||
! and log-gamma for large N to avoid overflow
|
||||
function ffact(N) result(gont)
|
||||
implicit none
|
||||
integer,intent(in) :: N
|
||||
|
||||
real*8 :: gont
|
||||
integer :: i
|
||||
|
||||
! Lookup table for factorials 0! to 20! (precomputed)
|
||||
real*8, parameter, dimension(0:20) :: fact_table = [ &
|
||||
1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
|
||||
362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
|
||||
87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
|
||||
355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
|
||||
2432902008176640000.d0 ]
|
||||
integer :: i
|
||||
|
||||
! sanity check
|
||||
if(N < 0)then
|
||||
write(*,*) "ffact: error input for factorial"
|
||||
gont = 1.d0
|
||||
return
|
||||
endif
|
||||
|
||||
! Use lookup table for small N (fast path)
|
||||
if(N <= 20)then
|
||||
gont = fact_table(N)
|
||||
else
|
||||
! Use log-gamma function for large N: N! = exp(log_gamma(N+1))
|
||||
! This avoids overflow and is computed efficiently
|
||||
gont = exp(log_gamma(dble(N+1)))
|
||||
endif
|
||||
gont = 1.d0
|
||||
do i=1,N
|
||||
gont = gont*i
|
||||
enddo
|
||||
|
||||
return
|
||||
|
||||
@@ -2312,3 +2263,4 @@ subroutine find_maximum(ext,X,Y,Z,fun,val,pos,llb,uub)
|
||||
return
|
||||
|
||||
end subroutine
|
||||
|
||||
|
||||
@@ -16,66 +16,115 @@ using namespace std;
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#endif
|
||||
|
||||
// Intel oneMKL LAPACK interface
|
||||
#include <mkl_lapacke.h>
|
||||
/* Linear equation solution using Intel oneMKL LAPACK.
|
||||
/* Linear equation solution by Gauss-Jordan elimination.
|
||||
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
|
||||
containing the right-hand side vectors. On output a is
|
||||
replaced by its matrix inverse, and b is replaced by the
|
||||
corresponding set of solution vectors.
|
||||
|
||||
Mathematical equivalence:
|
||||
Solves: A * x = b => x = A^(-1) * b
|
||||
Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
|
||||
within numerical precision. */
|
||||
corresponding set of solution vectors */
|
||||
|
||||
int gaussj(double *a, double *b, int n)
|
||||
{
|
||||
// Allocate pivot array and workspace
|
||||
lapack_int *ipiv = new lapack_int[n];
|
||||
lapack_int info;
|
||||
double swap;
|
||||
|
||||
// Make a copy of matrix a for solving (dgesv modifies it to LU form)
|
||||
double *a_copy = new double[n * n];
|
||||
for (int i = 0; i < n * n; i++) {
|
||||
a_copy[i] = a[i];
|
||||
int *indxc, *indxr, *ipiv;
|
||||
indxc = new int[n];
|
||||
indxr = new int[n];
|
||||
ipiv = new int[n];
|
||||
|
||||
int i, icol, irow, j, k, l, ll;
|
||||
double big, dum, pivinv, temp;
|
||||
|
||||
for (j = 0; j < n; j++)
|
||||
ipiv[j] = 0;
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
big = 0.0;
|
||||
for (j = 0; j < n; j++)
|
||||
if (ipiv[j] != 1)
|
||||
for (k = 0; k < n; k++)
|
||||
{
|
||||
if (ipiv[k] == 0)
|
||||
{
|
||||
if (fabs(a[j * n + k]) >= big)
|
||||
{
|
||||
big = fabs(a[j * n + k]);
|
||||
irow = j;
|
||||
icol = k;
|
||||
}
|
||||
}
|
||||
else if (ipiv[k] > 1)
|
||||
{
|
||||
cout << "gaussj: Singular Matrix-1" << endl;
|
||||
for (int ii = 0; ii < n; ii++)
|
||||
{
|
||||
for (int jj = 0; jj < n; jj++)
|
||||
cout << a[ii * n + jj] << " ";
|
||||
cout << endl;
|
||||
}
|
||||
return 1; // error return
|
||||
}
|
||||
}
|
||||
|
||||
ipiv[icol] = ipiv[icol] + 1;
|
||||
if (irow != icol)
|
||||
{
|
||||
for (l = 0; l < n; l++)
|
||||
{
|
||||
swap = a[irow * n + l];
|
||||
a[irow * n + l] = a[icol * n + l];
|
||||
a[icol * n + l] = swap;
|
||||
}
|
||||
|
||||
swap = b[irow];
|
||||
b[irow] = b[icol];
|
||||
b[icol] = swap;
|
||||
}
|
||||
|
||||
indxr[i] = irow;
|
||||
indxc[i] = icol;
|
||||
|
||||
if (a[icol * n + icol] == 0.0)
|
||||
{
|
||||
cout << "gaussj: Singular Matrix-2" << endl;
|
||||
for (int ii = 0; ii < n; ii++)
|
||||
{
|
||||
for (int jj = 0; jj < n; jj++)
|
||||
cout << a[ii * n + jj] << " ";
|
||||
cout << endl;
|
||||
}
|
||||
return 1; // error return
|
||||
}
|
||||
|
||||
pivinv = 1.0 / a[icol * n + icol];
|
||||
a[icol * n + icol] = 1.0;
|
||||
for (l = 0; l < n; l++)
|
||||
a[icol * n + l] *= pivinv;
|
||||
b[icol] *= pivinv;
|
||||
for (ll = 0; ll < n; ll++)
|
||||
if (ll != icol)
|
||||
{
|
||||
dum = a[ll * n + icol];
|
||||
a[ll * n + icol] = 0.0;
|
||||
for (l = 0; l < n; l++)
|
||||
a[ll * n + l] -= a[icol * n + l] * dum;
|
||||
b[ll] -= b[icol] * dum;
|
||||
}
|
||||
}
|
||||
|
||||
// Step 1: Solve linear system A*x = b using LU decomposition
|
||||
// LAPACKE_dgesv uses column-major by default, but we use row-major
|
||||
info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1);
|
||||
|
||||
if (info != 0) {
|
||||
cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl;
|
||||
delete[] ipiv;
|
||||
delete[] a_copy;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Step 2: Compute matrix inverse A^(-1) using LU factorization
|
||||
// First do LU factorization of original matrix a
|
||||
info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
|
||||
|
||||
if (info != 0) {
|
||||
cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
|
||||
delete[] ipiv;
|
||||
delete[] a_copy;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Then compute inverse from LU factorization
|
||||
info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
|
||||
|
||||
if (info != 0) {
|
||||
cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
|
||||
delete[] ipiv;
|
||||
delete[] a_copy;
|
||||
return 1;
|
||||
for (l = n - 1; l >= 0; l--)
|
||||
{
|
||||
if (indxr[l] != indxc[l])
|
||||
for (k = 0; k < n; k++)
|
||||
{
|
||||
swap = a[k * n + indxr[l]];
|
||||
a[k * n + indxr[l]] = a[k * n + indxc[l]];
|
||||
a[k * n + indxc[l]] = swap;
|
||||
}
|
||||
}
|
||||
|
||||
delete[] indxc;
|
||||
delete[] indxr;
|
||||
delete[] ipiv;
|
||||
delete[] a_copy;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -512,10 +512,11 @@
|
||||
IMPLICIT DOUBLE PRECISION (A-H,O-Z)
|
||||
DIMENSION V(N),W(N)
|
||||
! SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT.
|
||||
! Optimized using Intel oneMKL BLAS ddot
|
||||
! Mathematical equivalence: DGVV = sum_{i=1}^{N} V(i)*W(i)
|
||||
|
||||
DOUBLE PRECISION, EXTERNAL :: DDOT
|
||||
DGVV = DDOT(N, V, 1, W, 1)
|
||||
SUM = 0.0D0
|
||||
DO 10 I = 1,N
|
||||
SUM = SUM + V(I)*W(I)
|
||||
10 CONTINUE
|
||||
DGVV = SUM
|
||||
RETURN
|
||||
END
|
||||
|
||||
@@ -1,107 +0,0 @@
|
||||
#include "interp_lb_profile.h"
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
|
||||
namespace InterpLBProfile {
|
||||
|
||||
bool write_profile(const char *filepath, int nprocs,
|
||||
const double *rank_times,
|
||||
const int *heavy_ranks, int num_heavy,
|
||||
double threshold_ratio)
|
||||
{
|
||||
FILE *fp = fopen(filepath, "wb");
|
||||
if (!fp) return false;
|
||||
|
||||
ProfileHeader hdr;
|
||||
hdr.magic = MAGIC;
|
||||
hdr.version = VERSION;
|
||||
hdr.nprocs = nprocs;
|
||||
hdr.num_heavy = num_heavy;
|
||||
hdr.threshold_ratio = threshold_ratio;
|
||||
|
||||
fwrite(&hdr, sizeof(hdr), 1, fp);
|
||||
fwrite(rank_times, sizeof(double), nprocs, fp);
|
||||
fwrite(heavy_ranks, sizeof(int), num_heavy, fp);
|
||||
fclose(fp);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool read_profile(const char *filepath, int current_nprocs,
|
||||
int *heavy_ranks, int &num_heavy,
|
||||
double *rank_times, MPI_Comm comm)
|
||||
{
|
||||
int myrank;
|
||||
MPI_Comm_rank(comm, &myrank);
|
||||
|
||||
int valid = 0;
|
||||
ProfileHeader hdr;
|
||||
memset(&hdr, 0, sizeof(hdr));
|
||||
|
||||
if (myrank == 0) {
|
||||
FILE *fp = fopen(filepath, "rb");
|
||||
if (fp) {
|
||||
if (fread(&hdr, sizeof(hdr), 1, fp) == 1 &&
|
||||
hdr.magic == MAGIC && hdr.version == VERSION &&
|
||||
hdr.nprocs == current_nprocs)
|
||||
{
|
||||
if (fread(rank_times, sizeof(double), current_nprocs, fp)
|
||||
== (size_t)current_nprocs &&
|
||||
fread(heavy_ranks, sizeof(int), hdr.num_heavy, fp)
|
||||
== (size_t)hdr.num_heavy)
|
||||
{
|
||||
num_heavy = hdr.num_heavy;
|
||||
valid = 1;
|
||||
}
|
||||
} else if (fp) {
|
||||
printf("[InterpLB] Profile rejected: magic=0x%X version=%u "
|
||||
"nprocs=%d (current=%d)\n",
|
||||
hdr.magic, hdr.version, hdr.nprocs, current_nprocs);
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Bcast(&valid, 1, MPI_INT, 0, comm);
|
||||
if (!valid) return false;
|
||||
|
||||
MPI_Bcast(&num_heavy, 1, MPI_INT, 0, comm);
|
||||
MPI_Bcast(heavy_ranks, num_heavy, MPI_INT, 0, comm);
|
||||
MPI_Bcast(rank_times, current_nprocs, MPI_DOUBLE, 0, comm);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Select the ranks whose time exceeds threshold_ratio * mean(rank_times).
//
//   rank_times      : nprocs timing values, indexed by rank
//   threshold_ratio : multiple of the mean a rank must exceed to qualify
//   heavy_ranks     : output, receives at most max_heavy rank indices ordered
//                     by descending time (ties broken by ascending rank so the
//                     result is the same on every caller)
//
// Returns the number of entries written to heavy_ranks.  Returns 0 when
// nprocs or max_heavy is not positive.
int identify_heavy_ranks(const double *rank_times, int nprocs,
                         double threshold_ratio,
                         int *heavy_ranks, int max_heavy)
{
  // Nothing to rank, or no room to report: avoid 0/0 and a negative count.
  if (nprocs <= 0 || max_heavy <= 0) return 0;

  double sum = 0;
  for (int i = 0; i < nprocs; i++) sum += rank_times[i];
  const double mean = sum / nprocs;
  const double threshold = threshold_ratio * mean;

  // Collect candidates
  struct RankTime { int rank; double time; };
  RankTime *candidates = new RankTime[nprocs];
  int ncand = 0;

  for (int i = 0; i < nprocs; i++) {
    if (rank_times[i] > threshold) {
      candidates[ncand].rank = i;
      candidates[ncand].time = rank_times[i];
      ++ncand;
    }
  }

  // Sort descending by time; equal times fall back to rank order because
  // std::sort is not stable and would otherwise leave ties unspecified.
  std::sort(candidates, candidates + ncand,
            [](const RankTime &a, const RankTime &b) {
              if (a.time != b.time) return a.time > b.time;
              return a.rank < b.rank;
            });

  const int count = (ncand < max_heavy) ? ncand : max_heavy;
  for (int i = 0; i < count; i++)
    heavy_ranks[i] = candidates[i].rank;

  delete[] candidates;
  return count;
}
|
||||
|
||||
} // namespace InterpLBProfile
|
||||
Binary file not shown.
@@ -1,38 +0,0 @@
|
||||
#ifndef INTERP_LB_PROFILE_H
|
||||
#define INTERP_LB_PROFILE_H
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
namespace InterpLBProfile {
|
||||
|
||||
static const unsigned int MAGIC = 0x494C4250; // "ILBP"
|
||||
static const unsigned int VERSION = 1;
|
||||
|
||||
struct ProfileHeader {
|
||||
unsigned int magic;
|
||||
unsigned int version;
|
||||
int nprocs;
|
||||
int num_heavy;
|
||||
double threshold_ratio;
|
||||
};
|
||||
|
||||
// Write profile file (rank 0 only)
|
||||
bool write_profile(const char *filepath, int nprocs,
|
||||
const double *rank_times,
|
||||
const int *heavy_ranks, int num_heavy,
|
||||
double threshold_ratio);
|
||||
|
||||
// Read profile file (rank 0 reads, then broadcasts to all)
|
||||
// Returns true if file found and valid for current nprocs
|
||||
bool read_profile(const char *filepath, int current_nprocs,
|
||||
int *heavy_ranks, int &num_heavy,
|
||||
double *rank_times, MPI_Comm comm);
|
||||
|
||||
// Identify heavy ranks: those with time > threshold_ratio * mean
|
||||
int identify_heavy_ranks(const double *rank_times, int nprocs,
|
||||
double threshold_ratio,
|
||||
int *heavy_ranks, int max_heavy);
|
||||
|
||||
} // namespace InterpLBProfile
|
||||
|
||||
#endif /* INTERP_LB_PROFILE_H */
|
||||
@@ -1,27 +0,0 @@
|
||||
/* Auto-generated from interp_lb_profile.bin — do not edit */
|
||||
#ifndef INTERP_LB_PROFILE_DATA_H
|
||||
#define INTERP_LB_PROFILE_DATA_H
|
||||
|
||||
#define INTERP_LB_NPROCS 64
|
||||
#define INTERP_LB_NUM_HEAVY 4
|
||||
|
||||
static const int interp_lb_heavy_blocks[4] = {27, 35, 28, 36};
|
||||
|
||||
/* Split table: {block_id, r_left, r_right} */
|
||||
static const int interp_lb_splits[4][3] = {
|
||||
{27, 26, 27},
|
||||
{35, 34, 35},
|
||||
{28, 28, 29},
|
||||
{36, 36, 37},
|
||||
};
|
||||
|
||||
/* Rank remap for displaced neighbor blocks */
|
||||
static const int interp_lb_num_remaps = 4;
|
||||
static const int interp_lb_remaps[][2] = {
|
||||
{26, 25},
|
||||
{29, 30},
|
||||
{34, 33},
|
||||
{37, 38},
|
||||
};
|
||||
|
||||
#endif /* INTERP_LB_PROFILE_DATA_H */
|
||||
@@ -65,8 +65,6 @@ real*8,intent(in) :: eps
|
||||
! dx^4
|
||||
|
||||
! note the sign (-1)^r-1, now r=2
|
||||
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
|
||||
!DIR$ UNROLL PARTIAL(4)
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
|
||||
@@ -1,109 +0,0 @@
|
||||
#include "tool.h"
|
||||
|
||||
/*
 * C version of kodis.
 *
 * Fortran signature:
 *   subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
 *
 * Conventions:
 *   X, Y, Z  : coordinate arrays of length ex[0], ex[1], ex[2]
 *   f, f_rhs : ex[0]*ex[1]*ex[2] values laid out as addressed by idx_ex
 *   SoA[3]   : passed through unchanged to symmetry_bd
 *   eps      : scalar coefficient multiplying the term added to f_rhs
 */
void kodis(const int ex[3],
           const double *X, const double *Y, const double *Z,
           const double *f, double *f_rhs,
           const double SoA[3],
           int Symmetry, double eps)
{
  /* Weights of the symmetric 7-point combination and its 2^6 divisor. */
  const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
  const double cof = 64.0;
  const int NO_SYMM = 0, OCTANT = 2;

  /* Extents double as the 1-based upper index bounds of the Fortran code. */
  const int ni = ex[0], nj = ex[1], nk = ex[2];

  /* Fortran dX = X(2)-X(1) becomes X[1]-X[0] with 0-based arrays. */
  const double dX = X[1] - X[0];
  const double dY = Y[1] - Y[0];
  const double dZ = Z[1] - Z[0];

  /* 1-based lower index bounds: 1 by default, -2 in the symmetric cases. */
  const int ilo = (Symmetry == OCTANT && fabs(X[0]) < dX) ? -2 : 1;
  const int jlo = (Symmetry == OCTANT && fabs(Y[0]) < dY) ? -2 : 1;
  const int klo = (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) ? -2 : 1;

  /* Extended array fh holds (ex1+3)*(ex2+3)*(ex3+3) values (ord = 3). */
  const size_t fh_count =
      ((size_t)ni + 3) * ((size_t)nj + 3) * ((size_t)nk + 3);
  double *fh = (double*)malloc(fh_count * sizeof(double));
  if (!fh) return;

  /* Fortran: call symmetry_bd(3,ex,f,fh,SoA) */
  symmetry_bd(3, ex, f, fh, SoA);

  /* Loop-invariant factor of the update. */
  const double scale = eps / cof;

  /*
   * The Fortran loops run i,j,k = 1..ex; here the 0-based counters are
   * converted to the Fortran index values (iF, jF, kF) used to address fh.
   * A point is updated only when the full +-3 stencil fits inside the index
   * bounds in all three directions.
   */
  for (int k0 = 0; k0 < nk; ++k0) {
    const int kF = k0 + 1;
    if (kF - 3 < klo || kF + 3 > nk) continue;

    for (int j0 = 0; j0 < nj; ++j0) {
      const int jF = j0 + 1;
      if (jF - 3 < jlo || jF + 3 > nj) continue;

      for (int i0 = 0; i0 < ni; ++i0) {
        const int iF = i0 + 1;
        if (iF - 3 < ilo || iF + 3 > ni) continue;

        const double centre = fh[idx_fh_F(iF, jF, kF, ex)];

        const double x_part =
            ( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
              SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
              FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
              TWT * centre ) / dX;

        const double y_part =
            ( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
              SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
              FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
              TWT * centre ) / dY;

        const double z_part =
            ( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
              SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
              FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
              TWT * centre ) / dZ;

        /* Fortran: f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx + Dy + Dz) */
        f_rhs[idx_ex(i0, j0, k0, ex)] += scale * (x_part + y_part + z_part);
      }
    }
  }

  free(fh);
}
|
||||
@@ -1,255 +0,0 @@
|
||||
#include "tool.h"
|
||||
/*
|
||||
* 你需要提供 symmetry_bd 的 C 版本(或 Fortran 绑到 C 的接口)。
|
||||
* Fortran: call symmetry_bd(3,ex,f,fh,SoA)
|
||||
*
|
||||
* 约定:
|
||||
* nghost = 3
|
||||
* ex[3] = {ex1,ex2,ex3}
|
||||
* f = 原始网格 (ex1*ex2*ex3)
|
||||
* fh = 扩展网格 ((ex1+3)*(ex2+3)*(ex3+3)),对应 Fortran 的 (-2:ex1, ...)
|
||||
* SoA[3] = 输入参数
|
||||
*/
|
||||
void lopsided(const int ex[3],
|
||||
const double *X, const double *Y, const double *Z,
|
||||
const double *f, double *f_rhs,
|
||||
const double *Sfx, const double *Sfy, const double *Sfz,
|
||||
int Symmetry, const double SoA[3])
|
||||
{
|
||||
const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
|
||||
const double TWO = 2.0, F6 = 6.0, F18 = 18.0;
|
||||
const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
|
||||
|
||||
const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2;
|
||||
(void)OCTANT; // 这里和 Fortran 一样只是定义了不用也没关系
|
||||
|
||||
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
||||
|
||||
// 对应 Fortran: dX = X(2)-X(1) (Fortran 1-based)
|
||||
// C: X[1]-X[0]
|
||||
const double dX = X[1] - X[0];
|
||||
const double dY = Y[1] - Y[0];
|
||||
const double dZ = Z[1] - Z[0];
|
||||
|
||||
const double d12dx = ONE / F12 / dX;
|
||||
const double d12dy = ONE / F12 / dY;
|
||||
const double d12dz = ONE / F12 / dZ;
|
||||
|
||||
// Fortran 里算了 d2dx/d2dy/d2dz 但本 subroutine 里没用到(保持一致也算出来)
|
||||
const double d2dx = ONE / TWO / dX;
|
||||
const double d2dy = ONE / TWO / dY;
|
||||
const double d2dz = ONE / TWO / dZ;
|
||||
(void)d2dx; (void)d2dy; (void)d2dz;
|
||||
|
||||
// Fortran:
|
||||
// imax = ex(1); jmax = ex(2); kmax = ex(3)
|
||||
const int imaxF = ex1;
|
||||
const int jmaxF = ex2;
|
||||
const int kmaxF = ex3;
|
||||
|
||||
// Fortran:
|
||||
// imin=jmin=kmin=1; 若满足对称条件则设为 -2
|
||||
int iminF = 1, jminF = 1, kminF = 1;
|
||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
|
||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
|
||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
|
||||
|
||||
// 分配 fh:大小 (ex1+3)*(ex2+3)*(ex3+3)
|
||||
const size_t nx = (size_t)ex1 + 3;
|
||||
const size_t ny = (size_t)ex2 + 3;
|
||||
const size_t nz = (size_t)ex3 + 3;
|
||||
const size_t fh_size = nx * ny * nz;
|
||||
|
||||
double *fh = (double*)malloc(fh_size * sizeof(double));
|
||||
if (!fh) return; // 内存不足:直接返回(你也可以改成 abort/报错)
|
||||
|
||||
// Fortran: call symmetry_bd(3,ex,f,fh,SoA)
|
||||
symmetry_bd(3, ex, f, fh, SoA);
|
||||
|
||||
/*
|
||||
* Fortran 主循环:
|
||||
* do k=1,ex(3)-1
|
||||
* do j=1,ex(2)-1
|
||||
* do i=1,ex(1)-1
|
||||
*
|
||||
* 转成 C 0-based:
|
||||
* k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2
|
||||
*
|
||||
* 并且 Fortran 里的 i/j/k 在 fh 访问时,仍然是 Fortran 索引值:
|
||||
* iF=i0+1, jF=j0+1, kF=k0+1
|
||||
*/
|
||||
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
||||
const int kF = k0 + 1;
|
||||
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
||||
const int jF = j0 + 1;
|
||||
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
||||
const int iF = i0 + 1;
|
||||
|
||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||
|
||||
// ---------------- x direction ----------------
|
||||
const double sfx = Sfx[p];
|
||||
if (sfx > ZEO) {
|
||||
// Fortran: if(i+3 <= imax)
|
||||
// iF+3 <= ex1 <=> i0+4 <= ex1 <=> i0 <= ex1-4
|
||||
if (i0 <= ex1 - 4) {
|
||||
f_rhs[p] += sfx * d12dx *
|
||||
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
|
||||
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
|
||||
}
|
||||
// elseif(i+2 <= imax) <=> i0 <= ex1-3
|
||||
else if (i0 <= ex1 - 3) {
|
||||
f_rhs[p] += sfx * d12dx *
|
||||
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
||||
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
|
||||
}
|
||||
// elseif(i+1 <= imax) <=> i0 <= ex1-2(循环里总成立)
|
||||
else if (i0 <= ex1 - 2) {
|
||||
f_rhs[p] -= sfx * d12dx *
|
||||
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
||||
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
|
||||
}
|
||||
} else if (sfx < ZEO) {
|
||||
// Fortran: if(i-3 >= imin)
|
||||
// (iF-3) >= iminF <=> (i0-2) >= iminF
|
||||
if ((i0 - 2) >= iminF) {
|
||||
f_rhs[p] -= sfx * d12dx *
|
||||
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
||||
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
|
||||
}
|
||||
// elseif(i-2 >= imin) <=> (i0-1) >= iminF
|
||||
else if ((i0 - 1) >= iminF) {
|
||||
f_rhs[p] += sfx * d12dx *
|
||||
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
||||
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
|
||||
}
|
||||
// elseif(i-1 >= imin) <=> i0 >= iminF
|
||||
else if (i0 >= iminF) {
|
||||
f_rhs[p] += sfx * d12dx *
|
||||
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
|
||||
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------- y direction ----------------
|
||||
const double sfy = Sfy[p];
|
||||
if (sfy > ZEO) {
|
||||
// jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4
|
||||
if (j0 <= ex2 - 4) {
|
||||
f_rhs[p] += sfy * d12dy *
|
||||
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
|
||||
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
|
||||
} else if (j0 <= ex2 - 3) {
|
||||
f_rhs[p] += sfy * d12dy *
|
||||
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
||||
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
|
||||
} else if (j0 <= ex2 - 2) {
|
||||
f_rhs[p] -= sfy * d12dy *
|
||||
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
||||
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
|
||||
}
|
||||
} else if (sfy < ZEO) {
|
||||
if ((j0 - 2) >= jminF) {
|
||||
f_rhs[p] -= sfy * d12dy *
|
||||
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
||||
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
|
||||
} else if ((j0 - 1) >= jminF) {
|
||||
f_rhs[p] += sfy * d12dy *
|
||||
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
||||
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
|
||||
} else if (j0 >= jminF) {
|
||||
f_rhs[p] += sfy * d12dy *
|
||||
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
|
||||
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------- z direction ----------------
|
||||
const double sfz = Sfz[p];
|
||||
if (sfz > ZEO) {
|
||||
if (k0 <= ex3 - 4) {
|
||||
f_rhs[p] += sfz * d12dz *
|
||||
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
|
||||
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
|
||||
} else if (k0 <= ex3 - 3) {
|
||||
f_rhs[p] += sfz * d12dz *
|
||||
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
||||
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
|
||||
} else if (k0 <= ex3 - 2) {
|
||||
f_rhs[p] -= sfz * d12dz *
|
||||
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
||||
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
|
||||
}
|
||||
} else if (sfz < ZEO) {
|
||||
if ((k0 - 2) >= kminF) {
|
||||
f_rhs[p] -= sfz * d12dz *
|
||||
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
||||
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
|
||||
} else if ((k0 - 1) >= kminF) {
|
||||
f_rhs[p] += sfz * d12dz *
|
||||
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
||||
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
|
||||
} else if (k0 >= kminF) {
|
||||
f_rhs[p] += sfz * d12dz *
|
||||
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
||||
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
|
||||
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
free(fh);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -487,201 +487,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
|
||||
|
||||
end subroutine lopsided
|
||||
|
||||
!-----------------------------------------------------------------------------
|
||||
! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
|
||||
! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
|
||||
! Mathematically identical to calling lopsided then kodis separately.
|
||||
!-----------------------------------------------------------------------------
|
||||
subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
! Purpose:
!   Accumulates two contributions into f_rhs, both evaluated on the same
!   padded copy fh of f (one symmetry_bd call instead of two):
!     1. Sfx, Sfy, Sfz times a five-point first-derivative difference of fh
!        along x, y, z.  The sign of each coefficient picks the direction in
!        which the stencil is biased ("lopsided"); near a grid edge the code
!        falls back to the centered stencil and then to the oppositely biased
!        one, depending on how many neighbours are available.
!     2. If eps > 0, eps/64 times the centered seven-point sixth difference
!        (coefficients 1,-6,15,-20,15,-6,1) of fh along each axis, divided by
!        the grid spacing of that axis.
!
! Arguments:
!   ex(3)        in    number of grid points along x, y, z
!   X, Y, Z      in    coordinate arrays; only X(1),X(2) (resp. Y, Z) are read,
!                      to get the spacing and to test proximity to zero
!   f            in    field being differenced
!   f_rhs        inout right-hand side that both contributions are added to
!   Sfx,Sfy,Sfz  in    per-point coefficients of the x/y/z differences
!   Symmetry     in    compared against NO_SYMM (0) and EQ_SYMM (1) below
!   SoA(3)       in    forwarded unchanged to symmetry_bd
!                      (presumably the parity of f per axis -- TODO confirm)
!   eps          in    dissipation strength; the dissipation loop is skipped
!                      unless eps > 0
|
||||
implicit none
|
||||
|
||||
!~~~~~~> Input parameters:
|
||||
|
||||
integer, intent(in) :: ex(1:3),Symmetry
|
||||
real*8, intent(in) :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
|
||||
real*8,dimension(ex(1),ex(2),ex(3)),intent(in) :: f,Sfx,Sfy,Sfz
|
||||
|
||||
real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
|
||||
real*8,dimension(3),intent(in) ::SoA
|
||||
real*8,intent(in) :: eps
|
||||
|
||||
!~~~~~~> local variables:
|
||||
! note index -2,-1,0, so we have 3 extra points
|
||||
real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)) :: fh
|
||||
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
|
||||
real*8 :: dX,dY,dZ
|
||||
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
|
||||
real*8, parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
|
||||
real*8, parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
|
||||
real*8, parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
|
||||
! NOTE: OCTANT is declared but not referenced in this subroutine.
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
|
||||
! kodis parameters
|
||||
real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
|
||||
real*8, parameter :: cof=6.4d1 ! 2^6
|
||||
|
||||
! Grid spacing is taken from the first two coordinates of each axis.
dX = X(2)-X(1)
|
||||
dY = Y(2)-Y(1)
|
||||
dZ = Z(2)-Z(1)
|
||||
|
||||
! 1/(12 h): common prefactor of the five-point first-derivative stencils.
d12dx = ONE/F12/dX
|
||||
d12dy = ONE/F12/dY
|
||||
d12dz = ONE/F12/dZ
|
||||
|
||||
! NOTE: d2dx, d2dy, d2dz are assigned here but never read in this subroutine.
d2dx = ONE/TWO/dX
|
||||
d2dy = ONE/TWO/dY
|
||||
d2dz = ONE/TWO/dZ
|
||||
|
||||
imax = ex(1)
|
||||
jmax = ex(2)
|
||||
kmax = ex(3)
|
||||
|
||||
! Lowest index a stencil may touch on each axis.  It is 1 by default and
! drops to -2 (the padded layers of fh) when the symmetry setting applies to
! that axis and the first coordinate lies within one grid spacing of zero.
imin = 1
|
||||
jmin = 1
|
||||
kmin = 1
|
||||
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
|
||||
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
|
||||
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
|
||||
|
||||
! Single symmetry_bd call shared by both advection and dissipation
|
||||
! (presumably copies f into fh and fills the 3 padded layers at indices
! -2..0 using SoA -- TODO confirm against symmetry_bd)
call symmetry_bd(3,ex,f,fh,SoA)
|
||||
|
||||
! ---- Advection (lopsided) loop ----
|
||||
! upper bound set ex-1 only for efficiency,
|
||||
! the loop body will set ex 0 also
|
||||
do k=1,ex(3)-1
|
||||
do j=1,ex(2)-1
|
||||
do i=1,ex(1)-1
|
||||
! x direction
|
||||
! Sfx > 0: prefer the stencil on i-1..i+3; if i+3 is past imax use the
! centered stencil on i-2..i+2; if i+2 is also past imax use the mirrored
! stencil on i-3..i+1 (note the leading minus sign on that branch).
if(Sfx(i,j,k) > ZEO)then
|
||||
if(i+3 <= imax)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
|
||||
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
|
||||
-F6*fh(i+2,j,k)+ fh(i+3,j,k))
|
||||
elseif(i+2 <= imax)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
|
||||
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
|
||||
|
||||
elseif(i+1 <= imax)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)- &
|
||||
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
|
||||
-F6*fh(i-2,j,k)+ fh(i-3,j,k))
|
||||
endif
|
||||
! Sfx < 0: same three stencils tried in the opposite order, limited by imin.
! Sfx == 0 adds nothing.
elseif(Sfx(i,j,k) < ZEO)then
|
||||
if(i-3 >= imin)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)- &
|
||||
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
|
||||
-F6*fh(i-2,j,k)+ fh(i-3,j,k))
|
||||
elseif(i-2 >= imin)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
|
||||
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
|
||||
|
||||
elseif(i-1 >= imin)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
|
||||
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
|
||||
-F6*fh(i+2,j,k)+ fh(i+3,j,k))
|
||||
endif
|
||||
endif
|
||||
|
||||
! y direction
|
||||
! Same stencil selection as for x, using Sfy, the j index and jmin/jmax.
if(Sfy(i,j,k) > ZEO)then
|
||||
if(j+3 <= jmax)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
|
||||
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
|
||||
-F6*fh(i,j+2,k)+ fh(i,j+3,k))
|
||||
elseif(j+2 <= jmax)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
|
||||
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
|
||||
|
||||
elseif(j+1 <= jmax)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)- &
|
||||
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
|
||||
-F6*fh(i,j-2,k)+ fh(i,j-3,k))
|
||||
endif
|
||||
elseif(Sfy(i,j,k) < ZEO)then
|
||||
if(j-3 >= jmin)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)- &
|
||||
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
|
||||
-F6*fh(i,j-2,k)+ fh(i,j-3,k))
|
||||
elseif(j-2 >= jmin)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
|
||||
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
|
||||
|
||||
elseif(j-1 >= jmin)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
|
||||
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
|
||||
-F6*fh(i,j+2,k)+ fh(i,j+3,k))
|
||||
endif
|
||||
endif
|
||||
|
||||
! z direction
|
||||
! Same stencil selection as for x, using Sfz, the k index and kmin/kmax.
if(Sfz(i,j,k) > ZEO)then
|
||||
if(k+3 <= kmax)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
|
||||
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
|
||||
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
|
||||
elseif(k+2 <= kmax)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
|
||||
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
|
||||
|
||||
elseif(k+1 <= kmax)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)- &
|
||||
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
|
||||
-F6*fh(i,j,k-2)+ fh(i,j,k-3))
|
||||
endif
|
||||
elseif(Sfz(i,j,k) < ZEO)then
|
||||
if(k-3 >= kmin)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)- &
|
||||
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
|
||||
-F6*fh(i,j,k-2)+ fh(i,j,k-3))
|
||||
elseif(k-2 >= kmin)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
|
||||
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
|
||||
|
||||
elseif(k-1 >= kmin)then
|
||||
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
|
||||
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
|
||||
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
|
||||
endif
|
||||
endif
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
|
||||
! ---- Dissipation (kodis) loop ----
|
||||
! Runs over the full index range 1..ex (unlike the advection loop) and only
! updates points whose complete seven-point stencil fits inside
! [imin,imax] x [jmin,jmax] x [kmin,kmax] on all three axes at once.
if(eps > ZEO) then
|
||||
do k=1,ex(3)
|
||||
do j=1,ex(2)
|
||||
do i=1,ex(1)
|
||||
|
||||
if(i-3 >= imin .and. i+3 <= imax .and. &
|
||||
j-3 >= jmin .and. j+3 <= jmax .and. &
|
||||
k-3 >= kmin .and. k+3 <= kmax) then
|
||||
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
|
||||
(fh(i-3,j,k)+fh(i+3,j,k)) - &
|
||||
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
|
||||
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
|
||||
TWT* fh(i,j,k) )/dX + &
|
||||
( &
|
||||
(fh(i,j-3,k)+fh(i,j+3,k)) - &
|
||||
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
|
||||
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
|
||||
TWT* fh(i,j,k) )/dY + &
|
||||
( &
|
||||
(fh(i,j,k-3)+fh(i,j,k+3)) - &
|
||||
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
|
||||
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
|
||||
TWT* fh(i,j,k) )/dZ )
|
||||
endif
|
||||
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
endif
|
||||
|
||||
return
|
||||
|
||||
end subroutine lopsided_kodis
|
||||
|
||||
#elif (ghost_width == 4)
|
||||
! sixth order code
|
||||
! Compute advection terms in right hand sides of field equations
|
||||
|
||||
@@ -1,77 +1,83 @@
|
||||
|
||||
#define tetradtype 2
|
||||
|
||||
#define Cell
|
||||
|
||||
#define ghost_width 3
|
||||
|
||||
|
||||
|
||||
#define GAUGE 0
|
||||
|
||||
#define CPBC_ghost_width (ghost_width)
|
||||
|
||||
#define ABV 0
|
||||
|
||||
#define EScalar_CC 2
|
||||
|
||||
#if 0
|
||||
|
||||
define tetradtype
|
||||
v:r; u: phi; w: theta
|
||||
tetradtype 0
|
||||
v^a = (x,y,z)
|
||||
orthonormal order: v,u,w
|
||||
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
|
||||
tetradtype 1
|
||||
orthonormal order: w,u,v
|
||||
m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of PRD 85, 124062(2012)
|
||||
tetradtype 2
|
||||
v_a = (x,y,z)
|
||||
orthonormal order: v,u,w
|
||||
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
|
||||
|
||||
define Cell or Vertex
|
||||
Cell center or Vertex center
|
||||
|
||||
define ghost_width
|
||||
2nd order: 2
|
||||
4th order: 3
|
||||
6th order: 4
|
||||
8th order: 5
|
||||
|
||||
define WithShell
|
||||
use shell or not
|
||||
|
||||
define CPBC
|
||||
use constraint preserving boundary condition or not
|
||||
only affect Z4c
|
||||
CPBC only supports WithShell
|
||||
|
||||
define GAUGE
|
||||
0: B^i gauge
|
||||
1: David puncture gauge
|
||||
2: MB B^i gauge
|
||||
3: RIT B^i gauge
|
||||
4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
|
||||
5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
|
||||
6: MGB1 B^i gauge
|
||||
7: MGB2 B^i gauge
|
||||
|
||||
define CPBC_ghost_width (ghost_width)
|
||||
buffer points for CPBC boundary
|
||||
|
||||
define ABV
|
||||
0: using BSSN variable for constraint violation and psi4 calculation
|
||||
1: using ADM variable for constraint violation and psi4 calculation
|
||||
|
||||
define EScalar_CC
|
||||
Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
|
||||
1: Case C of 1112.3928, V=0
|
||||
2: shell with phi(r) = phi0 * a2^2/(1+a2^2), f(R) = R+a2*R^2 induced V
|
||||
3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
|
||||
4: a2 = +oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
|
||||
5: shell with phi(r) = phi0 * Exp(-(r-r0)**2/sigma), V = 0
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#if 0
|
||||
note here
|
||||
v:r; u: phi; w: theta
|
||||
tetradtype 0
|
||||
v^a = (x,y,z)
|
||||
orthonormal order: v,u,w
|
||||
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
|
||||
tetradtype 1
|
||||
orthonormal order: w,u,v
|
||||
m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of PRD 85, 124062(2012)
|
||||
tetradtype 2
|
||||
v_a = (x,y,z)
|
||||
orthonormal order: v,u,w
|
||||
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
|
||||
#endif
|
||||
#define tetradtype 2
|
||||
|
||||
#if 0
|
||||
note here
|
||||
Cell center or Vertex center
|
||||
#endif
|
||||
#define Cell
|
||||
|
||||
#if 0
|
||||
note here
|
||||
2nd order: 2
|
||||
4th order: 3
|
||||
6th order: 4
|
||||
8th order: 5
|
||||
#endif
|
||||
#define ghost_width 3
|
||||
|
||||
#if 0
|
||||
note here
|
||||
use shell or not
|
||||
#endif
|
||||
#define WithShell
|
||||
|
||||
#if 0
|
||||
note here
|
||||
use constraint preserving boundary condition or not
|
||||
only affect Z4c
|
||||
#endif
|
||||
#define CPBC
|
||||
|
||||
#if 0
|
||||
note here
|
||||
Gauge condition type
|
||||
0: B^i gauge
|
||||
1: David's puncture gauge
|
||||
2: MB B^i gauge
|
||||
3: RIT B^i gauge
|
||||
4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
|
||||
5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
|
||||
6: MGB1 B^i gauge
|
||||
7: MGB2 B^i gauge
|
||||
#endif
|
||||
#define GAUGE 2
|
||||
|
||||
#if 0
|
||||
buffer points for CPBC boundary
|
||||
#endif
|
||||
#define CPBC_ghost_width (ghost_width)
|
||||
|
||||
#if 0
|
||||
using BSSN variable for constraint violation and psi4 calculation: 0
|
||||
using ADM variable for constraint violation and psi4 calculation: 1
|
||||
#endif
|
||||
#define ABV 0
|
||||
|
||||
#if 0
|
||||
Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
|
||||
1: Case C of 1112.3928, V=0
|
||||
2: shell with a2^2*phi0/(1+a2^2), f(R) = R+a2*R^2 induced V
|
||||
3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
|
||||
4: a2 = oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
|
||||
5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0
|
||||
#endif
|
||||
#define EScalar_CC 2
|
||||
|
||||
|
||||
|
||||
@@ -1,145 +1,112 @@
|
||||
|
||||
#ifndef MICRODEF_H
|
||||
#define MICRODEF_H
|
||||
|
||||
#include "macrodef.fh"
|
||||
|
||||
// application parameters
|
||||
|
||||
#define SommerType 0
|
||||
|
||||
#define GaussInt
|
||||
|
||||
#define ABEtype 0
|
||||
|
||||
//#define With_AHF
|
||||
#define Psi4type 0
|
||||
|
||||
//#define Point_Psi4
|
||||
|
||||
#define RPS 1
|
||||
|
||||
#define AGM 0
|
||||
|
||||
#define RPB 0
|
||||
|
||||
#define MAPBH 1
|
||||
|
||||
#define PSTR 0
|
||||
|
||||
#define REGLEV 0
|
||||
|
||||
//#define USE_GPU
|
||||
|
||||
//#define CHECKDETAIL
|
||||
|
||||
//#define FAKECHECK
|
||||
|
||||
//
|
||||
// define SommerType
|
||||
// sommerfeld boundary type
|
||||
// 0: bam
|
||||
// 1: shibata
|
||||
//
|
||||
// define GaussInt
|
||||
// for Using Gauss-Legendre quadrature in theta direction
|
||||
//
|
||||
// define ABEtype
|
||||
// 0: BSSN vacuum
|
||||
// 1: coupled to scalar field
|
||||
// 2: Z4c vacuum
|
||||
// 3: coupled to Maxwell field
|
||||
//
|
||||
// define With_AHF
|
||||
// using Apparent Horizon Finder
|
||||
//
|
||||
// define Psi4type
|
||||
// Psi4 calculation method
|
||||
// 0: EB method
|
||||
// 1: 4-D method
|
||||
//
|
||||
// define Point_Psi4
|
||||
// for Using point psi4 or not
|
||||
//
|
||||
// define RPS
|
||||
// RestrictProlong in Step (0) or after Step (1)
|
||||
//
|
||||
// define AGM
|
||||
// Enforce algebra constraint
|
||||
// for every RK4 sub step: 0
|
||||
// only when iter_count == 3: 1
|
||||
// after routine Step: 2
|
||||
//
|
||||
// define RPB
|
||||
// Restrict Prolong using BAM style 1 or old style 0
|
||||
//
|
||||
// define MAPBH
|
||||
// 1: move Analysis out of the 4 sub steps and treat PBH with Euler method
|
||||
//
|
||||
// define PSTR
|
||||
// parallel structure
|
||||
// 0: level by level
|
||||
// 1: considering all levels
|
||||
// 2: as 1 but reverse the CPU order
|
||||
// 3: Frank's scheme
|
||||
//
|
||||
// define REGLEV
|
||||
// regrid for every level or for all levels at a time
|
||||
// 0: for every level;
|
||||
// 1: for all
|
||||
//
|
||||
// define USE_GPU
|
||||
// use gpu or not
|
||||
//
|
||||
// define CHECKDETAIL
|
||||
// use checkpoint for every process
|
||||
//
|
||||
// define FAKECHECK
|
||||
// use FakeCheckPrepare to write CheckPoint
|
||||
//
|
||||
|
||||
////================================================================
|
||||
// some basic parameters for numerical calculation
|
||||
////================================================================
|
||||
|
||||
#define dim 3
|
||||
|
||||
//#define Cell or Vertex in "macrodef.fh"
|
||||
|
||||
#define buffer_width 6
|
||||
|
||||
#define SC_width buffer_width
|
||||
|
||||
#define CS_width (2*buffer_width)
|
||||
|
||||
//
|
||||
// define Cell or Vertex in "macrodef.fh"
|
||||
//
|
||||
// define buffer_width
|
||||
// buffer point number for mesh refinement interface
|
||||
//
|
||||
// define SC_width buffer_width
|
||||
// buffer point number shell-box interface, on shell
|
||||
//
|
||||
// define CS_width
|
||||
// buffer point number shell-box interface, on box
|
||||
//
|
||||
|
||||
// Compile-time sanity check: the mesh-refinement buffer must be at least as
// wide as the ghost zone. Equality passes this guard, so the diagnostic
// states ">=" rather than a strict ">".
#if(buffer_width < ghost_width)
|
||||
# error we always assume buffer_width>=ghost_width
|
||||
#endif
|
||||
|
||||
#define PACK 1
|
||||
#define UNPACK 2
|
||||
|
||||
#define Mymax(a,b) (((a) > (b)) ? (a) : (b))
|
||||
#define Mymin(a,b) (((a) < (b)) ? (a) : (b))
|
||||
|
||||
#define feq(a,b,d) (fabs(a-b)<d)
|
||||
#define flt(a,b,d) ((a-b)<d)
|
||||
#define fgt(a,b,d) ((a-b)>d)
|
||||
|
||||
#define TINY 1e-10
|
||||
|
||||
#endif /* MICRODEF_H */
|
||||
|
||||
|
||||
#ifndef MICRODEF_H
|
||||
#define MICRODEF_H
|
||||
|
||||
#include "microdef.fh"
|
||||
|
||||
// application parameters
|
||||
|
||||
/// ****
|
||||
// sommerfeld boundary type
|
||||
// 0: bam, 1: shibata
|
||||
#define SommerType 0
|
||||
|
||||
/// ****
|
||||
// for Using Gauss-Legendre quadrature in theta direction
|
||||
#define GaussInt
|
||||
|
||||
/// ****
|
||||
// 0: BSSN vacuum
|
||||
// 1: coupled to scalar field
|
||||
// 2: Z4c vacuum
|
||||
// 3: coupled to Maxwell field
|
||||
//
|
||||
#define ABEtype 2
|
||||
|
||||
/// ****
|
||||
// using Apparent Horizon Finder
|
||||
//#define With_AHF
|
||||
|
||||
/// ****
|
||||
// Psi4 calculation method
|
||||
// 0: EB method
|
||||
// 1: 4-D method
|
||||
//
|
||||
#define Psi4type 0
|
||||
|
||||
/// ****
|
||||
// for Using point psi4 or not
|
||||
//#define Point_Psi4
|
||||
|
||||
/// ****
|
||||
// RestrictProlong in Step (0) or after Step (1)
|
||||
#define RPS 1
|
||||
|
||||
/// ****
|
||||
// Enforce algebra constraint
|
||||
// for every RK4 sub step: 0
|
||||
// only when iter_count == 3: 1
|
||||
// after routine Step: 2
|
||||
#define AGM 0
|
||||
|
||||
/// ****
|
||||
// Restrict Prolong using BAM style 1 or old style 0
|
||||
#define RPB 0
|
||||
|
||||
/// ****
|
||||
// 1: move Analysis out of the 4 sub steps and treat PBH with Euler method
|
||||
#define MAPBH 1
|
||||
|
||||
/// ****
|
||||
// parallel structure, 0: level by level, 1: considering all levels, 2: as 1 but reverse the CPU order, 3: Frank's scheme
|
||||
#define PSTR 0
|
||||
|
||||
/// ****
|
||||
// regrid for every level or for all levels at a time
|
||||
// 0: for every level; 1: for all
|
||||
#define REGLEV 0
|
||||
|
||||
/// ****
|
||||
// use gpu or not
|
||||
//#define USE_GPU
|
||||
|
||||
/// ****
|
||||
// use checkpoint for every process
|
||||
//#define CHECKDETAIL
|
||||
|
||||
/// ****
|
||||
// use FakeCheckPrepare to write CheckPoint
|
||||
//#define FAKECHECK
|
||||
////================================================================
|
||||
// some basic parameters for numerical calculation
|
||||
#define dim 3
|
||||
|
||||
//#define Cell or Vertex in "microdef.fh"
|
||||
|
||||
// ******
|
||||
// buffer point number for mesh refinement interface
|
||||
#define buffer_width 6
|
||||
|
||||
// ******
|
||||
// buffer point number shell-box interface, on shell
|
||||
#define SC_width buffer_width
|
||||
// buffer point number shell-box interface, on box
|
||||
#define CS_width (2*buffer_width)
|
||||
|
||||
// Compile-time sanity check: the mesh-refinement buffer must be at least as
// wide as the ghost zone. Equality passes this guard, so the diagnostic
// states ">=" rather than a strict ">".
#if(buffer_width < ghost_width)
|
||||
#error we always assume buffer_width>=ghost_width
|
||||
#endif
|
||||
|
||||
#define PACK 1
|
||||
#define UNPACK 2
|
||||
|
||||
#define Mymax(a,b) (((a) > (b)) ? (a) : (b))
|
||||
#define Mymin(a,b) (((a) < (b)) ? (a) : (b))
|
||||
|
||||
#define feq(a,b,d) (fabs(a-b)<d)
|
||||
#define flt(a,b,d) ((a-b)<d)
|
||||
#define fgt(a,b,d) ((a-b)>d)
|
||||
|
||||
#define TINY 1e-10
|
||||
|
||||
#endif /* MICRODEF_H */
|
||||
|
||||
@@ -2,27 +2,6 @@
|
||||
|
||||
include makefile.inc
|
||||
|
||||
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
||||
## make -> opt (PGO-guided, maximum performance)
|
||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
||||
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
|
||||
|
||||
ifeq ($(PGO_MODE),instrument)
|
||||
## Phase 1: instrumentation — omit -fp-model fast=2 for numerical stability (NOTE: -ipo is still passed below; remove it there if a faster instrumented build is wanted)
|
||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
-align array64byte -fpp -I${MKLROOT}/include
|
||||
else
|
||||
## opt (default): maximum performance with PGO profile data
|
||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=$(PROFDATA) \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=$(PROFDATA) \
|
||||
-align array64byte -fpp -I${MKLROOT}/include
|
||||
endif
|
||||
|
||||
.SUFFIXES: .o .f90 .C .for .cu
|
||||
|
||||
.f90.o:
|
||||
@@ -37,54 +16,13 @@ endif
|
||||
.cu.o:
|
||||
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
||||
|
||||
# C rewrite of BSSN RHS kernel and helpers
|
||||
bssn_rhs_c.o: bssn_rhs_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
fderivs_c.o: fderivs_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
fdderivs_c.o: fdderivs_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
kodiss_c.o: kodiss_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
lopsided_c.o: lopsided_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
|
||||
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
||||
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=$(TP_PROFDATA) \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
||||
|
||||
TwoPunctures.o: TwoPunctures.C
|
||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||
|
||||
TwoPunctureABE.o: TwoPunctureABE.C
|
||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||
|
||||
# Input files
|
||||
|
||||
## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
|
||||
ifeq ($(USE_CXX_KERNELS),0)
|
||||
# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
|
||||
CFILES =
|
||||
else
|
||||
# C++ mode (default): C rewrite of bssn_rhs and helper kernels
|
||||
CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o
|
||||
endif
|
||||
|
||||
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||
cgh.o bssn_class.o surface_integral.o ShellPatch.o\
|
||||
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
|
||||
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
|
||||
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
|
||||
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
|
||||
NullShellPatch2_Evo.o writefile_f.o
|
||||
|
||||
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||
cgh.o surface_integral.o ShellPatch.o\
|
||||
@@ -94,9 +32,9 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
|
||||
NullShellPatch2_Evo.o \
|
||||
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
|
||||
|
||||
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
||||
F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
||||
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
||||
rungekutta4_rout.o diff_new.o kodiss.o kodiss_sh.o\
|
||||
rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
|
||||
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
|
||||
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
|
||||
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
|
||||
@@ -107,14 +45,6 @@ F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
||||
scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\
|
||||
NullNews2.o tool_f.o
|
||||
|
||||
ifeq ($(USE_CXX_KERNELS),0)
|
||||
# Fortran mode: include original bssn_rhs.o
|
||||
F90FILES = $(F90FILES_BASE) bssn_rhs.o
|
||||
else
|
||||
# C++ mode (default): bssn_rhs.o replaced by C++ kernel
|
||||
F90FILES = $(F90FILES_BASE)
|
||||
endif
|
||||
|
||||
F77FILES = zbesh.o
|
||||
|
||||
AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \
|
||||
@@ -127,7 +57,7 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
|
||||
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
|
||||
|
||||
# file dependences
|
||||
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
|
||||
# Rebuild all of these objects whenever macrodef.fh changes.
# (The GPU object list is C++FILES_GPU; the former C++FILESGPU is undefined and expanded to nothing.)
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
|
||||
|
||||
$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
|
||||
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
|
||||
@@ -150,7 +80,7 @@ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h
|
||||
|
||||
$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
|
||||
|
||||
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
|
||||
$(C++FILES) $(C++FILES_GPU) $(AHFDOBJS) $(CUDAFILES): macrodef.h
|
||||
|
||||
# Attach the header dependency to the objects listed in TwoPunctureFILES
# (a bare "TwoPunctureFILES:" would only define an unrelated target with that literal name).
$(TwoPunctureFILES): TwoPunctures.h
|
||||
|
||||
@@ -159,14 +89,14 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
|
||||
misc.o : zbesh.o
|
||||
|
||||
# projects
|
||||
ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
|
||||
ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
|
||||
|
||||
ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
|
||||
ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
|
||||
|
||||
TwoPunctureABE: $(TwoPunctureFILES)
|
||||
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
|
||||
|
||||
clean:
|
||||
rm -f *.o ABE ABEGPU TwoPunctureABE make.log
|
||||
|
||||
@@ -8,31 +8,18 @@ filein = -I/usr/include/ -I${MKLROOT}/include
|
||||
|
||||
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
||||
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
||||
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
|
||||
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
|
||||
|
||||
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
|
||||
## opt : (default) maximum performance with PGO profile-guided optimization
|
||||
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
|
||||
PGO_MODE ?= opt
|
||||
|
||||
## Interp_Points load balance profiling mode
|
||||
## off : (default) no load balance instrumentation
|
||||
## profile : Pass 1 — instrument Interp_Points to collect timing profile
|
||||
## optimize : Pass 2 — read profile and apply block rebalancing
|
||||
INTERP_LB_MODE ?= off
|
||||
|
||||
ifeq ($(INTERP_LB_MODE),profile)
|
||||
INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
|
||||
else ifeq ($(INTERP_LB_MODE),optimize)
|
||||
INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
|
||||
else
|
||||
INTERP_LB_FLAGS =
|
||||
endif
|
||||
|
||||
## Kernel implementation switch
|
||||
## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
|
||||
## 0 : fall back to original Fortran kernels
|
||||
USE_CXX_KERNELS ?= 1
|
||||
## Aggressive optimization flags:
|
||||
## -O3: Maximum optimization
|
||||
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
|
||||
## -fp-model fast=2: Aggressive floating-point optimizations
|
||||
## -fma: Enable fused multiply-add instructions
|
||||
## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
|
||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma \
|
||||
-fpp -I${MKLROOT}/include
|
||||
f90 = ifx
|
||||
f77 = ifx
|
||||
CXX = icpx
|
||||
@@ -43,3 +30,4 @@ Cu = nvcc
|
||||
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
||||
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
|
||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
|
||||
|
||||
|
||||
@@ -1,146 +0,0 @@
|
||||
#ifndef SHARE_FUNC_H
|
||||
#define SHARE_FUNC_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stddef.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
/*
 * Main grid: flatten a 0-based (i0, j0, k0) index into a 1D offset.
 * The first index varies fastest; ex[0] and ex[1] give the row and column extents.
 */
static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
    const size_t row = (size_t)ex[0];          /* elements per j step */
    const size_t plane = row * (size_t)ex[1];  /* elements per k step */
    return (size_t)i0 + (size_t)j0 * row + (size_t)k0 * plane;
}
|
||||
|
||||
/*
 * fh corresponds to the Fortran array fh(-1:ex1, -1:ex2, -1:ex3).
 * ord = 2, so every Fortran index is shifted by 1 to become 0-based.
 * iF/jF/kF are Fortran indices (may be -1, 0, 1..ex).
 */
static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
    enum { GHOST_SHIFT = 1, ORD = 2 };
    const size_t row = (size_t)(ex[0] + ORD);          /* ex1 + ord entries along i */
    const size_t plane = row * (size_t)(ex[1] + ORD);  /* (ex1 + ord) * (ex2 + ord) */

    return (size_t)(iF + GHOST_SHIFT)            /* 0..ex1+1 */
         + (size_t)(jF + GHOST_SHIFT) * row      /* 0..ex2+1 */
         + (size_t)(kF + GHOST_SHIFT) * plane;   /* 0..ex3+1 */
}
|
||||
|
||||
/*
 * fh corresponds to the Fortran array fh(-2:ex1, -2:ex2, -2:ex3).
 * ord = 3, so every Fortran index is shifted by 2 to become 0-based.
 * iF/jF/kF are Fortran indices (may be negative).
 */
static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
    enum { GHOST_SHIFT = 2, ORD = 3 };
    const size_t row = (size_t)(ex[0] + ORD);          /* ex1 + ord entries along i */
    const size_t plane = row * (size_t)(ex[1] + ORD);  /* (ex1 + ord) * (ex2 + ord) */

    return (size_t)(iF + GHOST_SHIFT)            /* 0..ex1+2 */
         + (size_t)(jF + GHOST_SHIFT) * row      /* 0..ex2+2 */
         + (size_t)(kF + GHOST_SHIFT) * plane;   /* 0..ex3+2 */
}
|
||||
|
||||
/*
 * func:  (1..extc1, 1..extc2, 1..extc3), 1-based in Fortran.
 * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran.
 *
 * On the C side:
 *   func  is treated as 0-based: i0 = 0..extc1-1, j0 = 0..extc2-1, k0 = 0..extc3-1.
 *   funcc is stored as a 1D array using shifted indices:
 *     iF in [-ord+1..extc1]  ->  ii = iF + (ord-1) in [0..extc1+ord-1]
 *     total length nx = extc1 + ord,
 *     and likewise ny = extc2 + ord, nz = extc3 + ord.
 */

/* Flatten a 0-based (i0, j0, k0) index into func; the first index varies fastest. */
static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
    const size_t row = (size_t)extc[0];
    const size_t plane = row * (size_t)extc[1];
    return (size_t)i0 + (size_t)j0 * row + (size_t)k0 * plane;
}
|
||||
|
||||
static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
|
||||
const int shift = ord - 1; // iF = -shift .. extc1
|
||||
const int nx = extc[0] + ord; // [-shift..extc1] 共 extc1+ord 个
|
||||
const int ny = extc[1] + ord;
|
||||
|
||||
const int ii = iF + shift; // 0..extc1+shift
|
||||
const int jj = jF + shift; // 0..extc2+shift
|
||||
const int kk = kF + shift; // 0..extc3+shift
|
||||
|
||||
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
|
||||
}
|
||||
|
||||
/*
 * Equivalent to the Fortran:
 *   funcc(1:extc1,1:extc2,1:extc3)=func
 *   do i=0,ord-1
 *     funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
 *   enddo
 *   do i=0,ord-1
 *     funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
 *   enddo
 *   do i=0,ord-1
 *     funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
 *   enddo
 *
 * The three ghost fills must run in this order (x, then y, then z): the y pass
 * reads x ghosts written by the x pass, and the z pass reads both.
 */
static inline void symmetry_bd(int ord,
                               const int extc[3],
                               const double *func,
                               double *funcc,
                               const double SoA[3])
{
    const int nx = extc[0], ny = extc[1], nz = extc[2];
    const int lo = 1 - ord;  /* lowest Fortran index stored in funcc on every axis */

    /* 1) Interior copy: Fortran index (i, j, k) of funcc <- 0-based (i-1, j-1, k-1) of func. */
    for (int k = 1; k <= nz; ++k) {
        for (int j = 1; j <= ny; ++j) {
            for (int i = 1; i <= nx; ++i) {
                const size_t dst = idx_funcc_F(i, j, k, ord, extc);
                const size_t src = idx_func0(i - 1, j - 1, k - 1, extc);
                funcc[dst] = func[src];
            }
        }
    }

    /* 2) x ghosts: funcc(-g, 1:extc2, 1:extc3) = funcc(g+1, ...) * SoA(1), g = 0..ord-1. */
    for (int g = 0; g < ord; ++g) {
        for (int k = 1; k <= nz; ++k) {
            for (int j = 1; j <= ny; ++j) {
                const size_t dst = idx_funcc_F(-g, j, k, ord, extc);
                const size_t src = idx_funcc_F(g + 1, j, k, ord, extc);
                funcc[dst] = funcc[src] * SoA[0];
            }
        }
    }

    /* 3) y ghosts: the Fortran ":" on the first axis covers i = -ord+1 .. extc1. */
    for (int g = 0; g < ord; ++g) {
        for (int k = 1; k <= nz; ++k) {
            for (int i = lo; i <= nx; ++i) {
                const size_t dst = idx_funcc_F(i, -g, k, ord, extc);
                const size_t src = idx_funcc_F(i, g + 1, k, ord, extc);
                funcc[dst] = funcc[src] * SoA[1];
            }
        }
    }

    /* 4) z ghosts: both leading axes are covered over their full stored range. */
    for (int g = 0; g < ord; ++g) {
        for (int j = lo; j <= ny; ++j) {
            for (int i = lo; i <= nx; ++i) {
                const size_t dst = idx_funcc_F(i, j, -g, ord, extc);
                const size_t src = idx_funcc_F(i, j, g + 1, ord, extc);
                funcc[dst] = funcc[src] * SoA[2];
            }
        }
    }
}
|
||||
#endif
|
||||
@@ -220,9 +220,16 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
|
||||
pox[2][n] = rex * nz_g[n];
|
||||
}
|
||||
|
||||
double *shellf;
|
||||
shellf = new double[n_tot * InList];
|
||||
|
||||
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
|
||||
|
||||
int mp, Lp, Nmin, Nmax;
|
||||
|
||||
mp = n_tot / cpusize;
|
||||
Lp = n_tot - cpusize * mp;
|
||||
|
||||
if (Lp > myrank)
|
||||
{
|
||||
Nmin = myrank * mp + myrank;
|
||||
@@ -234,11 +241,6 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
|
||||
Nmax = Nmin + mp - 1;
|
||||
}
|
||||
|
||||
double *shellf;
|
||||
shellf = new double[n_tot * InList];
|
||||
|
||||
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
|
||||
|
||||
//|~~~~~> Integrate the dot product of Dphi with the surface normal.
|
||||
|
||||
double *RP_out, *IP_out;
|
||||
@@ -361,17 +363,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -563,17 +556,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -751,17 +735,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -1009,17 +984,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -1453,17 +1419,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -1897,17 +1854,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH,
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -2092,17 +2040,8 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch2 *GH, var *
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -2287,17 +2226,8 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch *GH, var *R
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
@@ -2384,9 +2314,25 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
|
||||
pox[2][n] = rex * nz_g[n];
|
||||
}
|
||||
|
||||
double *shellf;
|
||||
shellf = new double[n_tot * InList];
|
||||
|
||||
// we have assumed there is only one box on this level,
|
||||
// so we do not need loop boxes
|
||||
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
|
||||
|
||||
double Mass_out = 0;
|
||||
double ang_outx, ang_outy, ang_outz;
|
||||
double p_outx, p_outy, p_outz;
|
||||
ang_outx = ang_outy = ang_outz = 0.0;
|
||||
p_outx = p_outy = p_outz = 0.0;
|
||||
const double f1o8 = 0.125;
|
||||
|
||||
int mp, Lp, Nmin, Nmax;
|
||||
|
||||
mp = n_tot / cpusize;
|
||||
Lp = n_tot - cpusize * mp;
|
||||
|
||||
if (Lp > myrank)
|
||||
{
|
||||
Nmin = myrank * mp + myrank;
|
||||
@@ -2398,20 +2344,6 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
|
||||
Nmax = Nmin + mp - 1;
|
||||
}
|
||||
|
||||
double *shellf;
|
||||
shellf = new double[n_tot * InList];
|
||||
|
||||
// we have assumed there is only one box on this level,
|
||||
// so we do not need loop boxes
|
||||
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
|
||||
|
||||
double Mass_out = 0;
|
||||
double ang_outx, ang_outy, ang_outz;
|
||||
double p_outx, p_outy, p_outz;
|
||||
ang_outx = ang_outy = ang_outz = 0.0;
|
||||
p_outx = p_outy = p_outz = 0.0;
|
||||
const double f1o8 = 0.125;
|
||||
|
||||
double Chi, Psi;
|
||||
double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
|
||||
double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
|
||||
@@ -2532,13 +2464,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
|
||||
double scalar_in[7];
|
||||
MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
|
||||
px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
|
||||
}
|
||||
MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
#ifdef GaussInt
|
||||
mass = mass * rex * rex * dphi * factor;
|
||||
@@ -2801,13 +2735,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
|
||||
double scalar_in[7];
|
||||
MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
|
||||
px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
|
||||
}
|
||||
MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
|
||||
MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
|
||||
MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
|
||||
|
||||
#ifdef GaussInt
|
||||
mass = mass * rex * rex * dphi * factor;
|
||||
@@ -3084,13 +3020,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *c
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
|
||||
double scalar_in[7];
|
||||
MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
|
||||
px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
|
||||
}
|
||||
MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
#ifdef GaussInt
|
||||
mass = mass * rex * rex * dphi * factor;
|
||||
@@ -3669,17 +3607,8 @@ void surface_integral::surf_Wave(double rex, cgh *GH, ShellPatch *SH,
|
||||
}
|
||||
//|------+ Communicate and sum the results from each processor.
|
||||
|
||||
{
|
||||
double *RPIP_out = new double[2 * NN];
|
||||
double *RPIP = new double[2 * NN];
|
||||
memcpy(RPIP_out, RP_out, NN * sizeof(double));
|
||||
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
|
||||
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
memcpy(RP, RPIP, NN * sizeof(double));
|
||||
memcpy(IP, RPIP + NN, NN * sizeof(double));
|
||||
delete[] RPIP_out;
|
||||
delete[] RPIP;
|
||||
}
|
||||
MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
//|------= Free memory.
|
||||
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
#include "share_func.h"
|
||||
void fdderivs(const int ex[3],
|
||||
const double *f,
|
||||
double *fxx, double *fxy, double *fxz,
|
||||
double *fyy, double *fyz, double *fzz,
|
||||
const double *X, const double *Y, const double *Z,
|
||||
double SYM1, double SYM2, double SYM3,
|
||||
int Symmetry, int onoff);
|
||||
|
||||
void fderivs(const int ex[3],
|
||||
const double *f,
|
||||
double *fx, double *fy, double *fz,
|
||||
const double *X, const double *Y, const double *Z,
|
||||
double SYM1, double SYM2, double SYM3,
|
||||
int Symmetry, int onoff);
|
||||
|
||||
void kodis(const int ex[3],
|
||||
const double *X, const double *Y, const double *Z,
|
||||
const double *f, double *f_rhs,
|
||||
const double SoA[3],
|
||||
int Symmetry, double eps);
|
||||
|
||||
void lopsided(const int ex[3],
|
||||
const double *X, const double *Y, const double *Z,
|
||||
const double *f, double *f_rhs,
|
||||
const double *Sfx, const double *Sfy, const double *Sfz,
|
||||
int Symmetry, const double SoA[3]);
|
||||
@@ -1,72 +0,0 @@
|
||||
#!/usr/bin/env python3
"""Convert interp_lb_profile.bin to a C header for compile-time embedding.

Usage: <this script> <profile.bin> <output.h>

Binary layout consumed here (struct native byte order and alignment):
    uint32 magic, uint32 version, int32 nprocs, int32 num_heavy,
    double threshold, double times[nprocs], int32 heavy[num_heavy]
"""
import struct
import sys

if len(sys.argv) < 3:
    print(f"Usage: {sys.argv[0]} <profile.bin> <output.h>")
    sys.exit(1)

with open(sys.argv[1], 'rb') as f:
    # magic, version and threshold are read to walk the file layout; they are
    # not validated or used further down.
    magic, version, nprocs, num_heavy = struct.unpack('IIii', f.read(16))
    threshold = struct.unpack('d', f.read(8))[0]
    times = list(struct.unpack(f'{nprocs}d', f.read(nprocs * 8)))
    heavy = list(struct.unpack(f'{num_heavy}i', f.read(num_heavy * 4)))

# For each heavy rank, compute split: left half -> lighter neighbor, right half -> heavy rank
# (or vice versa depending on which neighbor is lighter)
NO_NEIGHBOR_TIME = 1e30  # sentinel cost for a neighbor that does not exist
splits = []
for hr in heavy:
    has_prev = hr > 0
    has_next = hr < nprocs - 1
    if not has_prev and not has_next:
        # Single-rank profile: there is nobody to share with. Keep both halves
        # on the heavy rank instead of emitting the invalid rank -1.
        # NOTE(review): assumes the consumer accepts r_left == r_right - confirm.
        splits.append((hr, hr, hr))
        continue
    prev_t = times[hr - 1] if has_prev else NO_NEIGHBOR_TIME
    next_t = times[hr + 1] if has_next else NO_NEIGHBOR_TIME
    if prev_t <= next_t:
        splits.append((hr, hr - 1, hr))  # (block_id, r_left, r_right)
    else:
        splits.append((hr, hr, hr + 1))

# Also remap the displaced neighbor blocks
heavy_ids = {block_id for block_id, _, _ in splits}  # built once for O(1) membership tests
remaps = {}
for hr, r_l, r_r in splits:
    if r_l != hr:
        # We took r_l's slot, so remap block r_l to its other neighbor
        displaced = r_l
        if displaced > 0 and displaced - 1 not in heavy_ids:
            remaps[displaced] = displaced - 1
        elif displaced < nprocs - 1:
            remaps[displaced] = displaced + 1
    else:
        displaced = r_r
        if displaced < nprocs - 1 and displaced + 1 not in heavy_ids:
            remaps[displaced] = displaced + 1
        elif displaced > 0:
            remaps[displaced] = displaced - 1

# The header comment contains a non-ASCII dash, so pin the encoding instead of
# depending on the locale default.
with open(sys.argv[2], 'w', encoding='utf-8') as out:
    out.write("/* Auto-generated from interp_lb_profile.bin — do not edit */\n")
    out.write("#ifndef INTERP_LB_PROFILE_DATA_H\n")
    out.write("#define INTERP_LB_PROFILE_DATA_H\n\n")
    out.write(f"#define INTERP_LB_NPROCS {nprocs}\n")
    out.write(f"#define INTERP_LB_NUM_HEAVY {num_heavy}\n\n")
    out.write(f"static const int interp_lb_heavy_blocks[{num_heavy}] = {{")
    out.write(", ".join(str(h) for h in heavy))
    out.write("};\n\n")
    out.write("/* Split table: {block_id, r_left, r_right} */\n")
    out.write(f"static const int interp_lb_splits[{num_heavy}][3] = {{\n")
    for bid, rl, rr in splits:
        out.write(f" {{{bid}, {rl}, {rr}}},\n")
    out.write("};\n\n")
    out.write("/* Rank remap for displaced neighbor blocks */\n")
    out.write(f"static const int interp_lb_num_remaps = {len(remaps)};\n")
    out.write(f"static const int interp_lb_remaps[][2] = {{\n")
    for src, dst in sorted(remaps.items()):
        out.write(f" {{{src}, {dst}}},\n")
    if not remaps:
        # Placeholder row so the unsized array still has an initializer.
        out.write(" {-1, -1},\n")
    out.write("};\n\n")
    out.write("#endif /* INTERP_LB_PROFILE_DATA_H */\n")

print(f"Generated {sys.argv[2]}:")
print(f" {num_heavy} heavy blocks to split: {heavy}")
for bid, rl, rr in splits:
    print(f" block {bid}: split -> rank {rl} (left), rank {rr} (right)")
for src, dst in sorted(remaps.items()):
    print(f" block {src}: remap -> rank {dst}")
|
||||
@@ -10,47 +10,6 @@
|
||||
|
||||
import AMSS_NCKU_Input as input_data
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
|
||||
def get_last_n_cores_per_socket(n=32, lscpu_output=None):
    """Return a ``taskset -c`` prefix selecting the last `n` CPUs of each NUMA node.

    Parameters
    ----------
    n : int
        Number of CPUs to keep per node, taken from the high end of the
        node's sorted CPU id list.
    lscpu_output : str or None
        Text in ``lscpu --parse=NODE,CPU`` format. When None (default) the
        command is executed to obtain it.

    Returns
    -------
    str
        ``"taskset -c <cpu-list>"``, or an empty string when no CPU could be
        parsed (so that prefixing a command with it stays harmless).

    Example: 2 nodes x 56 CPUs each, n=32 -> node0: 24-55, node1: 80-111
             -> "taskset -c 24-55,80-111"
    """
    if lscpu_output is None:
        result = subprocess.run(["lscpu", "--parse=NODE,CPU"], capture_output=True, text=True)
        lscpu_output = result.stdout

    # Build a dict: node_id -> list of CPU ids
    node_cpus = {}
    for line in lscpu_output.splitlines():
        if line.startswith("#") or not line.strip():
            continue
        parts = line.split(",")
        if len(parts) < 2:
            continue
        try:
            cpu_id = int(parts[1])
        except ValueError:
            continue  # not a data row
        try:
            node_id = int(parts[0])
        except ValueError:
            # NODE column empty/non-numeric (presumably no NUMA information
            # reported); treat the machine as a single node.
            node_id = 0
        node_cpus.setdefault(node_id, []).append(cpu_id)

    segments = []
    total = 0
    for node_id in sorted(node_cpus):
        selected = sorted(node_cpus[node_id])[-n:]  # last n CPUs of this node
        if not selected:
            continue
        total += len(selected)  # a node may hold fewer than n CPUs
        segments.append(_format_cpu_ranges(selected))

    if not segments:
        print(" CPU binding: disabled (no CPU topology information available)")
        return ""

    cpu_str = ",".join(segments)
    print(f" CPU binding: taskset -c {cpu_str} ({total} cores, last {n} per socket)")
    return f"taskset -c {cpu_str}"


def _format_cpu_ranges(cpus):
    """Collapse ascending CPU ids into taskset list syntax, e.g. [1, 4, 5] -> "1,4-5"."""
    pieces = []
    start = prev = cpus[0]
    for cpu in cpus[1:]:
        if cpu != prev + 1:
            # Gap in the id sequence: close the current run so that CPUs in
            # the gap are not bound by accident.
            pieces.append(str(start) if start == prev else f"{start}-{prev}")
            start = cpu
        prev = cpu
    pieces.append(str(start) if start == prev else f"{start}-{prev}")
    return ",".join(pieces)
|
||||
|
||||
|
||||
## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
|
||||
NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
|
||||
|
||||
## Build parallelism: match the number of bound cores
|
||||
BUILD_JOBS = 64
|
||||
|
||||
|
||||
##################################################################
|
||||
@@ -67,11 +26,11 @@ def makefile_ABE():
|
||||
print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " )
|
||||
print( )
|
||||
|
||||
## Build command with CPU binding to nohz_full cores
|
||||
## Build command
|
||||
if (input_data.GPU_Calculation == "no"):
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=optimize ABE"
|
||||
makefile_command = "make -j4" + " ABE"
|
||||
elif (input_data.GPU_Calculation == "yes"):
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
|
||||
makefile_command = "make -j4" + " ABEGPU"
|
||||
else:
|
||||
print( " CPU/GPU numerical calculation setting is wrong " )
|
||||
print( )
|
||||
@@ -108,8 +67,8 @@ def makefile_TwoPunctureABE():
|
||||
print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
|
||||
print( )
|
||||
|
||||
## Build command with CPU binding to nohz_full cores
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
|
||||
## Build command
|
||||
makefile_command = "make" + " TwoPunctureABE"
|
||||
|
||||
## Execute the command with subprocess.Popen and stream output
|
||||
makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
|
||||
@@ -146,11 +105,10 @@ def run_ABE():
|
||||
## Define the command to run; cast other values to strings as needed
|
||||
|
||||
if (input_data.GPU_Calculation == "no"):
|
||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
|
||||
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
|
||||
mpi_command = "mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
|
||||
mpi_command_outfile = "ABE_out.log"
|
||||
elif (input_data.GPU_Calculation == "yes"):
|
||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
|
||||
mpi_command = "mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
|
||||
mpi_command_outfile = "ABEGPU_out.log"
|
||||
|
||||
## Execute the MPI command and stream output
|
||||
@@ -183,14 +141,13 @@ def run_ABE():
|
||||
## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
|
||||
|
||||
def run_TwoPunctureABE():
|
||||
tp_time1=time.time()
|
||||
|
||||
print( )
|
||||
print( " Running the AMSS-NCKU executable file TwoPunctureABE " )
|
||||
print( )
|
||||
|
||||
## Define the command to run
|
||||
#TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
|
||||
TwoPuncture_command = " ./TwoPunctureABE"
|
||||
TwoPuncture_command = "./TwoPunctureABE"
|
||||
TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
|
||||
|
||||
## Execute the command with subprocess.Popen and stream output
|
||||
@@ -211,9 +168,7 @@ def run_TwoPunctureABE():
|
||||
print( )
|
||||
print( " The TwoPunctureABE simulation is finished " )
|
||||
print( )
|
||||
tp_time2=time.time()
|
||||
et=tp_time2-tp_time1
|
||||
print(f"Used time: {et}")
|
||||
|
||||
return
|
||||
|
||||
##################################################################
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
import multiprocessing
|
||||
|
||||
def run_plot_task(task):
    """Run one plotting job and hand back whatever it returns.

    Parameters
    ----------
    task : tuple
        A two-element (function, args_tuple) pair: the callable to invoke
        and the positional arguments to unpack into it.
    """
    (plot_function, positional_args) = task
    outcome = plot_function(*positional_args)
    return outcome
|
||||
|
||||
|
||||
def run_plot_tasks_parallel(plot_tasks):
    """Execute a list of independent plotting tasks in parallel.

    Uses the 'fork' context to create worker processes so that the main
    script is NOT re-imported/re-executed in child processes.

    Parameters
    ----------
    plot_tasks : iterable of tuples
        Each element is (function, args_tuple).

    Returns
    -------
    None
        Results of the individual tasks are discarded.
    """
    tasks = list(plot_tasks)  # materialise so the tasks can be counted (also accepts generators)
    if not tasks:
        return  # nothing to plot: do not fork a worker pool for no work

    try:
        cpu_limit = multiprocessing.cpu_count()
    except NotImplementedError:
        cpu_limit = 1
    # Never fork more workers than there are tasks.
    worker_count = min(len(tasks), cpu_limit)

    ctx = multiprocessing.get_context('fork')
    with ctx.Pool(processes=worker_count) as pool:
        pool.map(run_plot_task, tasks)
|
||||
@@ -1,97 +0,0 @@
|
||||
# AMSS-NCKU PGO Profile Analysis Report
|
||||
|
||||
## 1. Profiling Environment
|
||||
|
||||
| Item | Value |
|
||||
|------|-------|
|
||||
| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
|
||||
| Instrumentation Flag | `-fprofile-instr-generate` |
|
||||
| Optimization Level (instrumented) | `-O2 -xHost -fma` |
|
||||
| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
|
||||
| Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
|
||||
| Merged Profile | `default.profdata` (394 KB) |
|
||||
| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
|
||||
|
||||
## 2. Reduced Simulation Parameters (for profiling run)
|
||||
|
||||
| Parameter | Production Value | Profiling Value |
|
||||
|-----------|-----------------|-----------------|
|
||||
| MPI_processes | 64 | 1 |
|
||||
| grid_level | 9 | 4 |
|
||||
| static_grid_level | 5 | 3 |
|
||||
| static_grid_number | 96 | 24 |
|
||||
| moving_grid_number | 48 | 16 |
|
||||
| largest_box_xyz_max | 320^3 | 160^3 |
|
||||
| Final_Evolution_Time | 1000.0 | 10.0 |
|
||||
| Evolution_Step_Number | 10,000,000 | 1,000 |
|
||||
| Detector_Number | 12 | 2 |
|
||||
|
||||
## 3. Profile Summary
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Total instrumented functions | 1,392 |
|
||||
| Functions with non-zero counts | 117 (8.4%) |
|
||||
| Functions with zero counts | 1,275 (91.6%) |
|
||||
| Maximum function entry count | 386,459,248 |
|
||||
| Maximum internal block count | 370,477,680 |
|
||||
| Total block count | 4,198,023,118 |
|
||||
|
||||
## 4. Top 20 Hotspot Functions
|
||||
|
||||
| Rank | Total Count | Max Block Count | Function | Category |
|
||||
|------|------------|-----------------|----------|----------|
|
||||
| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
|
||||
| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
|
||||
| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
|
||||
| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
|
||||
| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
|
||||
| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
|
||||
| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
|
||||
| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
|
||||
| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
|
||||
| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
|
||||
| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
|
||||
| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
|
||||
| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
|
||||
| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
|
||||
| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
|
||||
| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
|
||||
| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
|
||||
| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
|
||||
| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
|
||||
| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
|
||||
|
||||
## 5. Hotspot Category Breakdown
|
||||
|
||||
Top 20 functions account for ~99% of total execution counts:
|
||||
|
||||
| Category | Functions | Combined Count | Share |
|
||||
|----------|-----------|---------------|-------|
|
||||
| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
|
||||
| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
|
||||
| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
|
||||
| Time integration | rungekutta4_rout_ | ~119M | ~3% |
|
||||
| Dissipation | kodis_ | ~92M | ~2% |
|
||||
| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% |
|
||||
|
||||
## 6. Conclusions
|
||||
|
||||
1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
|
||||
2. **Hotspot concentration is high**: Top 5 functions alone account for ~83% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets.
|
||||
3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
|
||||
4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
|
||||
5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
|
||||
|
||||
## 7. PGO Phase 2 Usage
|
||||
|
||||
To apply the profile, use the following flags in `makefile.inc`:
|
||||
|
||||
```makefile
|
||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
|
||||
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
|
||||
-align array64byte -fpp -I${MKLROOT}/include
|
||||
```
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -11,8 +11,6 @@
|
||||
import numpy ## numpy for array operations
|
||||
import scipy ## scipy for interpolation and signal processing
|
||||
import math
|
||||
import matplotlib
|
||||
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
|
||||
import matplotlib.pyplot as plt ## matplotlib for plotting
|
||||
import os ## os for system/file operations
|
||||
|
||||
|
||||
@@ -8,23 +8,16 @@
|
||||
##
|
||||
#################################################
|
||||
|
||||
## Restrict OpenMP to one thread per process so that running
|
||||
## many workers in parallel does not create an O(workers * BLAS_threads)
|
||||
## thread explosion. The variable MUST be set before numpy/scipy
|
||||
## are imported, because the BLAS library reads it only at load time.
|
||||
import os
|
||||
os.environ.setdefault("OMP_NUM_THREADS", "1")
|
||||
|
||||
import numpy
|
||||
import scipy
|
||||
import matplotlib
|
||||
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.colors import LogNorm
|
||||
from mpl_toolkits.mplot3d import Axes3D
|
||||
## import torch
|
||||
import AMSS_NCKU_Input as input_data
|
||||
|
||||
import os
|
||||
|
||||
|
||||
#########################################################################################
|
||||
|
||||
@@ -199,19 +192,3 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
|
||||
|
||||
####################################################################################
|
||||
|
||||
|
||||
####################################################################################
|
||||
## Allow this module to be run as a standalone script so that each
|
||||
## binary-data plot can be executed in a fresh subprocess whose BLAS
|
||||
## environment variables (set above) take effect before numpy loads.
|
||||
##
|
||||
## Usage: python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
|
||||
####################################################################################
|
||||
|
||||
if __name__ == '__main__':
    import sys

    ## Exactly three positional arguments are expected after the script name:
    ## <filename> <binary_outdir> <figure_outdir>
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
        sys.exit(1)

    ## Forward the three arguments, in order, to the plotting routine.
    plot_binary_data(*cli_args)
|
||||
|
||||
|
||||
@@ -8,8 +8,6 @@
|
||||
#################################################
|
||||
|
||||
import numpy ## numpy for array operations
|
||||
import matplotlib
|
||||
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
|
||||
import matplotlib.pyplot as plt ## matplotlib for plotting
|
||||
from mpl_toolkits.mplot3d import Axes3D ## needed for 3D plots
|
||||
import glob
|
||||
@@ -17,9 +15,6 @@ import os ## operating system utilities
|
||||
|
||||
import plot_binary_data
|
||||
import AMSS_NCKU_Input as input_data
|
||||
import subprocess
|
||||
import sys
|
||||
import multiprocessing
|
||||
|
||||
# plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots
|
||||
|
||||
@@ -55,40 +50,10 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
|
||||
file_list.append(x)
|
||||
print(x)
|
||||
|
||||
## Plot each file in parallel using subprocesses.
|
||||
## Each subprocess is a fresh Python process where the BLAS thread-count
|
||||
## environment variables (set at the top of plot_binary_data.py) take
|
||||
## effect before numpy is imported. This avoids the thread explosion
|
||||
## that occurs when multiprocessing.Pool with 'fork' context inherits
|
||||
## already-initialized multi-threaded BLAS from the parent.
|
||||
script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
|
||||
max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
|
||||
|
||||
running = []
|
||||
failed = []
|
||||
## Plot each file in the list
|
||||
for filename in file_list:
|
||||
print(filename)
|
||||
proc = subprocess.Popen(
|
||||
[sys.executable, script, filename, binary_outdir, figure_outdir],
|
||||
)
|
||||
running.append( (proc, filename) )
|
||||
## Keep at most max_workers subprocesses active at a time
|
||||
if len(running) >= max_workers:
|
||||
p, fn = running.pop(0)
|
||||
p.wait()
|
||||
if p.returncode != 0:
|
||||
failed.append(fn)
|
||||
|
||||
## Wait for all remaining subprocesses to finish
|
||||
for p, fn in running:
|
||||
p.wait()
|
||||
if p.returncode != 0:
|
||||
failed.append(fn)
|
||||
|
||||
if failed:
|
||||
print( " WARNING: the following binary data plots failed:" )
|
||||
for fn in failed:
|
||||
print( " ", fn )
|
||||
plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
|
||||
|
||||
print( )
|
||||
print( " Binary Data Plot Has been Finished " )
|
||||
|
||||
Reference in New Issue
Block a user